A regex (regular expression) is a sequence of characters that defines a search pattern, used for string matching.

It is used for searching, replacing, and validating text.

Use Cases:

a. Data Validation:
    1. Email Validation
    2. Phone Number Validation
    3. Password Validation

b. Search and Replace
    1. Find all the URLs in a document
    2. Censor specific words

c. Text Extraction
    1. Extract dates in different formats
    2. Extract hashtags from social media

d. NLP
    1. Remove Punctuation

In [106]:
import re

In [107]:
# Pull every run of consecutive digits out of a roll-number string.
pattern = r"\d+"  # one or more digits
text = "Roll No: 17 27 45 78"

matches = re.compile(pattern).findall(text)
print(matches)

['17', '27', '45', '78']


In [108]:
### Regex Basic Syntax

In [109]:
# "." (dot) stands for any single character other than a newline,
# so the trailing "c\nt" in the sample is not picked up.

pattern = r"c.t"
text = "I have a cat, a cut and a cot cet cht c\nt"

matches = [hit.group() for hit in re.finditer(pattern, text)]
print(matches)

['cat', 'cut', 'cot', 'cet', 'cht']


In [110]:
# "^" (caret) anchors the pattern to the start of the string.

pattern = r"^H"

text1 = "HelloJ World!"
text2 = "Bye World"

# re.search returns None when nothing matches, so these print True / False.
print(re.search(pattern, text1) is not None)
print(re.search(pattern, text2) is not None)

True
False


In [111]:
# "$" (dollar sign) anchors the pattern to the end of the string.

text1, text2 = "Hello World!", "Bye World"
pattern = r"d$"

# text1 ends with "!", text2 ends with "d".
for sample in (text1, text2):
    print(bool(re.search(pattern, sample)))

False
True


In [112]:
# "[...]" (character class) matches any single character listed inside the brackets.

pattern = r"[aeiud]"
text = "Hello World"

re.compile(pattern).findall(text)

['e', 'd']

In [113]:
# "\d"  matches a single digit (0-9)
# "\d+" matches a run of one or more digits

pattern = r"\d+"
text = "My age is 27 and my phone is 264382325 5"

[digits.group() for digits in re.finditer(pattern, text)]

['27', '264382325', '5']

In [114]:
# "\D" is the inverse of "\d": it matches any character that is not a digit.

pattern = r"\D+"
text = "Roll No: 170 27 45 78 wgfwuih"

matches = re.compile(pattern).findall(text)
print(matches)

['Roll No: ', ' ', ' ', ' ', ' wgfwuih']


In [115]:
# "\w" matches any word character: letters, digits and the underscore
# (which is why "tool_123" comes back as a single match).

pattern = r"\w+"
text = "Regex is a powerful tool_123 74654385"

re.compile(pattern).findall(text)

['Regex', 'is', 'a', 'powerful', 'tool_123', '74654385']

In [116]:
# "\W" matches any non-word character (spaces, punctuation, special characters, etc.).

pattern = r"\W+"
text = "Hello!, my name, is Debanjan!!!, 76345234 #$^&^$#"

[chunk.group() for chunk in re.finditer(pattern, text)]

['!, ', ' ', ', ', ' ', '!!!, ', ' #$^&^$#']

In [117]:
# "\s" matches whitespace (spaces, tabs, newlines).

pattern = r"\s+"
text = "Hello!, my name, is Debanjan!!!, 76345234 #$^&^$#"

re.compile(pattern).findall(text)

[' ', ' ', ' ', ' ', ' ', ' ']

In [118]:
# "\S" matches any character that is not whitespace.

pattern = r"\S+"
text = "Hello!, my name, is Debanjan!!!, 76345234 #$^&^$#"

[token.group() for token in re.finditer(pattern, text)]

['Hello!,', 'my', 'name,', 'is', 'Debanjan!!!,', '76345234', '#$^&^$#']

In [119]:
# "\b" matches a word boundary, so the "cat" inside "scatter" is skipped.

pattern = r"\bcat\b"
text = "A cat sat on a scatter mat."

re.compile(pattern).findall(text)

['cat']

In [120]:
# () Grouping: captures specific parts of a match.
# When a pattern has several capture groups, re.findall returns one tuple
# per match containing the text captured by each group.

text = "Today's date is 2025-02-10."

# The year group must be four digits wide. With (\d{2}) the match started in
# the middle of "2025" and the year was captured as "25".
pattern = r"(\d{4})-(\d{2})-(\d{2})"
re.findall(pattern, text)

[('2025', '02', '10')]

In [121]:
# "|" (OR) matches whichever of the alternatives is found.

pattern = r"cat|dog"
text = "I have a cat and a dog."

[animal.group() for animal in re.finditer(pattern, text)]

['cat', 'dog']

In [122]:
# "*" (asterisk) matches zero or more occurrences of the preceding element,
# so "gd" (no "o" at all) is matched as well.

pattern = r"go*d"
text = "gd god good goood"

re.compile(pattern).findall(text)

['gd', 'god', 'good', 'goood']

In [123]:
# "+" (plus) matches one or more occurrences of the preceding element,
# so "gd" is left out this time.

text, pattern = "gd god good goood", r"go+d"

[word.group() for word in re.finditer(pattern, text)]

['god', 'good', 'goood']

In [124]:
# "?" (question mark) matches zero or one occurrence of the preceding element.
#
# In "colou?r" the "u" is optional, so both spellings are found:
#   color
#   colour

pattern = r"colou?r"
text = "I like color and colour colour."

re.compile(pattern).findall(text)

['color', 'colour', 'colour']

In [125]:
# "{}" (curly braces) control how many times the preceding element repeats:
#   {n}   exactly n occurrences
#   {n,}  n or more occurrences
#   {n,m} between n and m occurrences
#
# Each (text, pattern) pair below demonstrates one form; the results are
# printed in the same order.
for text, pattern in (
    ("Area codes: 123, 456, 7890", r"\d{3}"),             # {n}
    ("Numbers: 1, 12, 123, 1234", r"\d{2,}"),             # {n,}
    ("Numbers: 1, 12, 123, 1234, 12345", r"\d{2,4}"),     # {n,m}
):
    print(re.findall(pattern, text))

['123', '456', '789']
['12', '123', '1234']
['12', '123', '1234', '1234']


Basic Regex Functions

## Practical Use case

Data Validation

In [126]:
# Email validation
#
# Example addresses:
# debanjan.chowdhury@s.amity.edu
# jyothisatya.gk@gmail.com
# debanjan700@outlook.com
# me.biochem@gmail.com
# anushajalasutram999@gmail.com
#
# Rules:
# 1. The address contains an "@".
# 2. The part after the "@" contains at least one ".".
# 3. Before the "@": one or more lowercase letters, digits, "." or "_".
# 4. After the "@": lowercase-letter labels separated by single dots
#    (no leading or doubled dots, so "user@..com" is rejected).
# 5. The last label is 2 or 3 letters long.

# No ^/$ anchors: the pattern is applied with re.fullmatch, which requires the
# whole string to match and, unlike "$", does not tolerate a trailing newline.
pattern = r"[a-z0-9._]+@[a-z]+(?:\.[a-z]+)*\.[a-z]{2,3}"

test_case1 = "debanjan.chowdhury@s.edu"
test_case2 = "jyothisatya.gk@gmail.com"
test_case3 = "debanjan700@outlook.com"
test_case4 = "me.biochem@gmail.com"
test_case5 = "anushajalasutram999@gmail.com"
test_case6 = "debanjan_chowdhury@gmail.com"

# Print one True/False per test case, in order.
for candidate in (
    test_case1,
    test_case2,
    test_case3,
    test_case4,
    test_case5,
    test_case6,
):
    print(bool(re.fullmatch(pattern, candidate)))

True
True
True
True
True
True


In [129]:
# Phone number validation
#
# Accepted formats:
# +91 1234567890
# 91 1234567890
# 1234567890
# 01234567890
# (+91 1234567890)

# Optional prefix (1-2 digits, optionally preceded by "+" and optionally
# followed by one whitespace character), then exactly 10 digits. The "+" is
# only allowed together with prefix digits, so "+1234567890" is rejected.
number = r"(?:\+?\d{1,2}\s?)?\d{10}"

# Parentheses are accepted only as a balanced pair around the whole number;
# a lone "(" or ")" is rejected.
pattern = r"\(" + number + r"\)|" + number

test_case1 = "+91 1234567890"

# re.fullmatch requires the entire string to match (no trailing newline or
# extra characters), so the pattern needs no ^/$ anchors.
print(bool(re.fullmatch(pattern, test_case1)))

True


Search and replace

In [130]:
# Collapse every run of whitespace into a single space.
pattern = r"\s+"
text = "Hello     World!     This is Python."

new_text = re.compile(pattern).sub(" ", text)
print(new_text)

Hello World! This is Python.


In [131]:
# Mask a card number so that only the last four digits stay visible.
# Target result: Credit card number is ****-****-****-5432

text = "Credit card number is 1234-5678-9876-5432"

# The only capture group is the final block of four digits; "\1" in the
# replacement re-inserts it after the masked blocks.
pattern = r"\d{4}-\d{4}-\d{4}-(\d{4})"

re.compile(pattern).sub(r"****-****-****-\1", text)

'Credit card number is ****-****-****-5432'

In [132]:
# Hashtags: a "#" followed by one or more word characters.

pattern = r"#\w+"
text = "I love #Python and #AI"

[tag.group() for tag in re.finditer(pattern, text)]

['#Python', '#AI']

In [133]:
# Date extraction: DD<sep>MM<sep>YY or DD<sep>MM<sep>YYYY, where <sep> is "/", "." or "-".

text = "Dates are: 10/02/2024, 11-02-2025, 12.02.2025, 13/02/25"

# ([/.-]) captures the first separator and \1 requires the second one to be
# the same, so mixed forms such as "10/02-2024" are not accepted.
# The year is either 4 or 2 digits (never 3), and \b on both sides keeps the
# pattern from matching inside a longer run of digits.
pattern = r"\b\d{2}([/.-])\d{2}\1(?:\d{4}|\d{2})\b"

# The pattern contains a capture group, so re.findall would return only the
# separators; take the full match text from each match object instead.
[found.group(0) for found in re.finditer(pattern, text)]

['10/02/2024', '11-02-2025', '12.02.2025', '13/02/25']