In [1]:
import re

## Basic Syntax Elements:

### Literal Characters (r'abc')

In [2]:
# re.match only tests the pattern at the very start of the string.
text = "abc is the beginning of the alphabet and ends with xyz"
pattern = r"b"
match = re.match(pattern, text)
if match:
    print(match.group())
else:
    print("No match")


No match


In [3]:
text.split()

['abc',
 'is',
 'the',
 'beginning',
 'of',
 'the',
 'alphabet',
 'and',
 'ends',
 'with',
 'xyz']

In [4]:

# Collect every occurrence of the literal character "c".
text = "abc abc abcxyz abcdfd bcv"
pattern = r"c"
matches = re.compile(pattern).findall(text)
print(matches)

['c', 'c', 'c', 'c', 'c']


### Dot (r'a.c')

In [5]:
# "." stands for any single character between "a" and "c".
text = "afc"
pattern = r"a.c"
match = re.match(pattern, text)
print("No match" if match is None else match.group())


afc


In [6]:

# "ac" is skipped: the dot requires exactly one character between "a" and "c".
text = "abc ac axc"
pattern = r"a.c"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)


['abc', 'axc']


### Caret (^)

In [7]:
# "^" anchors the pattern to the start of the string.
text = "abcdef"
pattern = r"^abc"
match = re.match(pattern, text)
if match is None:
    print("No match")
else:
    print(match.group())


abc


In [8]:

# Only the leading "abc" is reported; later occurrences are not at the start.
text = "abcdefabcabcd"
pattern = r"^abc"
matches = re.compile(pattern).findall(text)
print(matches)


['abc']


### Dollar ($)

In [9]:

# "$" anchors the pattern to the end of the string; re.search scans for it.
text = "xyzabc"
pattern = r"abc$"
match = re.search(pattern, text)
if match:
    print(match.group())
else:
    print("No match")

abc


In [10]:

# Only the trailing "abc" satisfies the end-of-string anchor.
text = "abc def abc"
pattern = r"abc$"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['abc']


### Square Brackets ([])

In [11]:

# A character class matches any one of the listed characters; search returns
# the first such character found in the text.
text = "hello world all item"
pattern = r"[aioue]"
match = re.search(pattern, text)
print("No match" if match is None else match.group())


e


In [12]:
# Hand-rolled equivalent of re.findall(pattern, text) for a simple character
# class: report each character of `text` that is listed inside the brackets.
# The surrounding "[" and "]" are regex syntax, not members of the class, so
# they are stripped first; otherwise a literal bracket in `text` would be
# reported as a match.
class_members = pattern.strip("[]")
list_symb = []
for letter in text:
    if letter in class_members:
        print(letter)
        list_symb.append(letter)
        

e
o
o
a
i
e


In [13]:

# Every vowel in the text, in order of appearance.
text = "hello world all item"
pattern = r"[aeiou]"
matches = re.compile(pattern).findall(text)
print(matches)


['e', 'o', 'o', 'a', 'i', 'e']


### Hyphen (-)

In [14]:

# Inside brackets, "0-9" is a range covering every digit.
text = "abc9123 123"
pattern = r"[0-9]"
match = re.search(pattern, text)
if match:
    print(match.group())
else:
    print("No match")


9


In [15]:

# Each digit is returned as its own single-character match.
text = "abc123 def456 9"
pattern = r"[0-9]"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)


['1', '2', '3', '4', '5', '6', '9']


### Asterisk (*)

In [16]:

# "c*" means zero or more "c" after "ab".
text = "abcc abccc"
pattern = r"abc*"
match = re.match(pattern, text)
print("No match" if match is None else match.group())


abcc


In [17]:
# "ab" alone qualifies because "*" also allows zero repetitions of "c".
text = "abcc ab ac a bc"
pattern = r"abc*"
matches = re.compile(pattern).findall(text)
print(matches)

['abcc', 'ab']


### Plus (+)

In [18]:
# "c+" requires at least one "c"; re.match additionally requires the match to
# start at position 0, which only text_2 satisfies.
text_1 = "ac abcc ac abc"
text_2 = "abc ac abc"
pattern = r"abc+"
match, match_2 = (re.match(pattern, candidate) for candidate in (text_1, text_2))
if match:
    print(match.group())
else:
    print("No match")
if match_2:
    print(match_2.group())
else:
    print("No match_2")

No match
abc


In [19]:
# The bare "ab" at the end is not reported: "+" needs one or more "c".
text = "abcc abc a abccdhjdsf ab"
pattern = r"abc+"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['abcc', 'abc', 'abcc']


### Question Mark (?)

In [20]:
# "c?" makes the "c" optional, so "ab" on its own is a valid match.
text = "abdca"
pattern = r"abc?"
match = re.match(pattern, text)
if match is None:
    print("No match")
else:
    print(match.group())

ab


In [21]:
text = "abcdfadcsadcs abcds a abcc abgfda"
# NOTE(review): this pattern uses "+" (one or more "f") rather than the "?"
# quantifier this section is about — confirm which one was intended.
pattern = r"abcdf+"
matches = re.findall(pattern, text)
print(matches)

['abcdf']


### Parentheses (())

In [22]:
# Parentheses group "abc" so that "+" repeats the whole group.
text = "abcabcabcabc abc"
pattern = r"(abc)+"
match = re.match(pattern, text)
print("No match" if match is None else match.group())

abcabcabcabc


In [23]:
# With a capturing group in the pattern, findall returns the group's text
# for each match rather than the whole match.
text = "abcabc abchello abdcc"
pattern = r"(abc)+"
matches = re.compile(pattern).findall(text)
print(matches)

['abc', 'abc']


### Pipe (|)

In [24]:
# "|" separates alternatives: the pattern succeeds if either side matches.
# Each side needs real content — an empty alternative (a trailing "|")
# matches the empty string, so the regex could never report "No match".
text = "abc abcdef"
pattern = r"abc|def"
match = re.match(pattern, text)
print(match.group() if match else "No match")

abc


In [25]:
text = "abc def abc abcdef"
# Alternatives are tried left to right, so "abc" wins at the start of
# "abcdef" and the "abcdef" branches (listed twice) never get to match;
# the remaining "def" is then matched separately.
pattern = r"abc|abcdef|def|abcdef"
matches = re.findall(pattern, text)
print(matches)

['abc', 'def', 'abc', 'abc', 'def']


### Backslash (\)

In [26]:
# The backslash turns "." into a literal dot instead of "any character".
text = "file.txt"
pattern = r"\."
match = re.search(pattern, text)
if match:
    print(match.group(0))
else:
    print("No match")

.


In [27]:
# One literal dot per file name.
text = "file.txt image.jpg"
pattern = r"\."
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['.', '.']


## Complex and practical use cases

In [28]:
# A dot followed by alphanumerics at the end of the name; index 0 of the
# match is the whole matched text, dot included.
text_1 = "document.pdf"
pattern = r"\.([a-zA-Z0-9]+)$"
match = re.search(pattern, text_1)
match[0]

'.pdf'

In [29]:

file_names = ["document.pdf", "image.jpeg", "data.csv", "README.md", "file"]

# Group 1 captures the extension without its leading dot.
pattern = r"\.([a-zA-Z0-9]+)$"

for file in file_names:
    match = re.search(pattern, file)
    if match is None:
        print(f"File: {file}, No extension found")
        continue
    print(f"File: {file}, Extension: {match.group(1)}")


File: document.pdf, Extension: pdf
File: image.jpeg, Extension: jpeg
File: data.csv, Extension: csv
File: README.md, Extension: md
File: file, No extension found


## Common Character Classes:

### \d

In [310]:

text = "There are 123 apples."

# \d matches a single digit.
pattern = r"\d"

matches = re.compile(pattern).findall(text)
print(matches)


['1', '2', '3']


### \D

In [112]:
# \D matches any single character that is not a digit.
text = "Order number: 123ABC!"
pattern = r"\D"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)


['O', 'r', 'd', 'e', 'r', ' ', 'n', 'u', 'm', 'b', 'e', 'r', ':', ' ', 'A', 'B', 'C', '!']


### \w

In [114]:
# \w matches letters, digits and the underscore.
text = "Python_3 is fun!"
pattern = r"\w"
matches = re.compile(pattern).findall(text)
print(matches)

['P', 'y', 't', 'h', 'o', 'n', '_', '3', 'i', 's', 'f', 'u', 'n']


### \W

In [115]:
# \W matches anything \w does not: punctuation, spaces, symbols.
text = "Hello, world! #Python3"
pattern = r"\W"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

[',', ' ', '!', ' ', '#']


### \s

In [116]:
# \s matches whitespace: here spaces, a tab and a newline.
text = "This is\tan example\nwith spaces."
pattern = r"\s"
matches = re.compile(pattern).findall(text)
print(matches)


[' ', '\t', ' ', '\n', ' ']


### \S

In [118]:
# \S matches every character that is not whitespace.
text = "This is\tan example\nwith spaces."
pattern = r"\S"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)


['T', 'h', 'i', 's', 'i', 's', 'a', 'n', 'e', 'x', 'a', 'm', 'p', 'l', 'e', 'w', 'i', 't', 'h', 's', 'p', 'a', 'c', 'e', 's', '.']


## Regex Functions in Python (re module)

### re.match(pattern, string)
* Determines if the pattern matches at the beginning of the string.

In [322]:
text = "Hello, world! Hello, Hello,"
pattern = r"Hello,"

# Only the occurrence at the very beginning of the text is considered.
match = re.match(pattern, text)
if match:
    print(match.group(0))
else:
    print("No match")


Hello,


In [317]:
text.split(' ')

['Hello,', 'world!']

In [318]:
def def_match(pattern, text):
    """Mimic ``re.match`` for a plain, literal ``pattern``.

    Args:
        pattern: Literal string to look for; regex metacharacters are not
            interpreted.
        text: String to test.

    Returns:
        ``pattern`` if ``text`` begins with it, otherwise ``None``.
    """
    # re.match only succeeds at position 0, so an occurrence of the pattern
    # later in the text must not count as a match.
    if text.startswith(pattern):
        return pattern
    return None

In [320]:
def def_find_all(pattern, text):
    """Collect the whitespace-separated tokens of ``text`` equal to ``pattern``.

    Args:
        pattern: Literal string each token is compared against.
        text: String that is split on whitespace.

    Returns:
        A list with one entry per token that equals ``pattern``, in order;
        empty when no token matches.
    """
    return [token for token in text.split() if token == pattern]

In [323]:
def_find_all(pattern, text)

['Hello,', 'Hello,', 'Hello,']

In [319]:
def_match(pattern, text)

'Hello,'

In [212]:
match

<re.Match object; span=(0, 6), match='Hello,'>

In [114]:
match.group(0)

'Hello'

In [215]:
text = "Hello, world start"
pattern = r"(Hello), (world) (start)"

# Indexing a match object with N is equivalent to calling group(N).
match = re.match(pattern, text)
print(match[0])  # Entire match
print(match[1])  # First captured group ("Hello")
print(match[2])  # Second captured group ("world")


Hello, world start
Hello
world


### re.search(pattern, string)
* Searches the string for the first match of the pattern.

In [217]:
text = "The; fox quick, brown fox fox"
pattern = r"fox"

# re.search scans the text and stops at the first occurrence.
search = re.search(pattern, text)
if search is None:
    print("No match")
else:
    print(search.group())


fox


In [220]:
print(search.start())
print(search.end())

5
8


In [221]:
text[search.start(): search.end()]

'fox'

In [188]:
text

'The; fox quick, brown fox'

### re.findall(pattern, string)
* Returns a list of all non-overlapping matches of the pattern in the string.

In [224]:
text = " quick The 12 quick 34 brown 56 foxes. quick "
pattern = r"\d+"  # Matches one or more digits
# Alternative to try: pattern = r"quick"
matches = re.findall(pattern, text)
print(matches)


['12', '34', '56']


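### re.finditer(pattern, string)
* Returns an iterator that yields a match object for each non-overlapping match of the pattern in the string.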

In [225]:
text = "The 12 quick 34 brown 56 foxes."
pattern = r"\d+"  # Matches one or more digits

# finditer yields match objects lazily, giving access to positions as well
# as the matched text.
matches = re.compile(pattern).finditer(text)
for match in matches:
    print(f"Found {match.group()} at position {match.start()} to {match.end()}")

Found 12 at position 4 to 6
Found 34 at position 13 to 15
Found 56 at position 22 to 24


### re.sub(pattern, repl, string)
* Replaces every match of the pattern with the specified replacement string.

In [226]:
text = "Hello, worlds!"
pattern = r"world"
replacement = "Python"

# Only the matched part is replaced; the trailing "s!" is kept.
new_text = re.compile(pattern).sub(replacement, text)
print(new_text)


Hello, Pythons!


### re.split(pattern, string): Splits the string at every match of the pattern.



In [240]:
text = "apple,banana. cherry,orange?pineapple"
pattern = r"[.;,?]"

# Split on any one of the listed punctuation marks; surrounding spaces are
# left in the pieces.
split_text = re.compile(pattern).split(text)
print(split_text)


['apple', 'banana', ' cherry', 'orange', 'pineapple']


### re.compile(pattern): Compiles the regex pattern into a regex object, which can be used for repeated matches.

In [243]:
pattern = re.compile(r"The")  # Compile a pattern that matches the literal text "The"
text = "The numbers The are 123 and 456"

matches = pattern.findall(text)
print(matches)


['The', 'The']


In [244]:
pattern.search('The numbers are 123 and 456').group()

'The'

### Additional, less commonly used functions of the re module

### 1. re.fullmatch(pattern, string):

In [147]:

text = "abc123"
pattern = r"abc123"

# fullmatch succeeds only when the pattern covers the entire string.
match = re.fullmatch(pattern, text)
if match:
    print(match.group())
else:
    print("No match")


abc123


### 2. re.purge():
* Clears the internal regex cache, which is useful if you're working with a large number of regex patterns and need to free up memory.


In [146]:

# Compile two patterns, then clear the re module's internal regex cache.
pattern1 = re.compile(r"\d+")
pattern2 = re.compile(r"[a-z]+")

re.purge()


### 3. re.escape(string):
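* Escapes the characters in a string that have special meaning in a regex, making it safe to use as a literal inside a pattern. Older Python versions escaped every non-alphanumeric character, including spaces, as in the output below; Python 3.7+ escapes only regex-special characters. This is useful when you want to match literal strings that may contain special regex characters (e.g., . or *).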

In [158]:
text = "Hello.$ How * are you?"
escaped_text = re.escape(text)
print(escaped_text)

Hello\.\$\ How\ \*\ are\ you\?


In [159]:
# A literal backslash followed by one character from the bracketed set of
# regex metacharacters.
pattern = r"\\[.*$^()|?+{}[\]]"  

matches = re.findall(pattern, escaped_text)
print(f"Extracted Special Characters: {matches}")

Extracted Special Characters: ['\\.', '\\$', '\\*', '\\?']


In [160]:

# \b\w+\b matches each run of word characters delimited by word boundaries.
words = re.findall(r'\b\w+\b', text)

print(f"Extracted Words: {words}")


Extracted Words: ['Hello', 'How', 'are', 'you']


### 4. re.subn(pattern, repl, string):
* Similar to re.sub(), but it also returns the number of substitutions made.

In [162]:
import re

text = "I like cats, cats are great!"
pattern = r"cats"
replacement = "dogs"

# subn returns a (new_string, number_of_substitutions) pair.
new_text, num_subs = re.compile(pattern).subn(replacement, text)
print(new_text)
print(num_subs)


I like dogs, dogs are great!
2


## Alternatives to Regex

### 1. str.find(): Finds the first occurrence of a substring within a string.

In [30]:
text = "The quick brown fox"
substring = "quick"

index = text.find(substring)
print(index)  # Output: 4


4


In [31]:
text[:5]

'The q'

In [32]:
text[index : index + len(substring)]

'quick'

In [33]:
def find_str(substring, text):
    """Return the first occurrence of ``substring`` sliced out of ``text``.

    Args:
        substring: The string to look for.
        text: The string to search in.

    Returns:
        The matching slice of ``text`` (equal to ``substring``), or an empty
        string when ``substring`` does not occur in ``text``.
    """
    index = text.find(substring)
    # str.find signals "not found" with -1; slicing from that index could
    # silently return the last character of ``text`` instead of nothing.
    if index == -1:
        return ""
    return text[index : index + len(substring)]

In [34]:
find_str(substring, text)

'quick'

### 2. str.replace(): Replaces occurrences of a substring with another substring.


In [35]:
text = "I love apples, apples are great!"
new_text = text.replace("apples", "oranges")
print(new_text)  # Output: I love oranges, oranges are great!


I love oranges, oranges are great!


### 3. str.split(): Splits a string into a list based on a delimiter.


In [36]:
text = "apple,banana,orange"
fruits = text.split(",")
print(fruits)  # Output: ['apple', 'banana', 'orange']


['apple', 'banana', 'orange']


## Third-Party Alternatives to the re Module: pyparsing

In [37]:
from pyparsing import Word, alphas, nums

word = Word(alphas)  # Matches a word of alphabetic characters
number = Word(nums)  # Matches a number of digits

# "+" combines the two expressions into one that expects a word followed by
# a number.
pattern = word + number

text = "Hello 123"
# NOTE(review): pyparsing 3.x documents parse_string as the preferred
# spelling of this method — confirm the installed version before renaming.
result = pattern.parseString(text)

print(result) 

['Hello', '123']


### The third-party regex module

In [38]:
import regex

text = "Café"  # "é" is a Unicode character
pattern = r"\p{L}+"  # Matches one or more Unicode letters

matches = regex.findall(pattern, text)
print(matches)  # Output: ['Café']

['Café']
