In [2]:
import re

In [None]:
# Metacharacters: Special characters with a reserved meaning in regex, such as ".", "*", "+", "?", etc.

# Character Classes: Representing groups of characters, like digits (\d), letters (\w), spaces (\s), etc.

# Quantifiers: Indicating the number of occurrences of a preceding element, like "*", "+", "{n}", "{m,n}", etc.

# Anchors: Marking the start (^) and end ($) of a string or line to be matched.

# Groups and Capture: Using parentheses to group patterns and capture matched text.

In [None]:
# re.search(pattern, string): Searches for the first occurrence of the pattern in the string.

# re.match(pattern, string): Searches for the pattern only at the beginning of the string.

# re.findall(pattern, string): Returns all occurrences of the pattern as a list of strings.

# re.sub(pattern, replacement, string): Replaces all occurrences of the pattern with the replacement.

In [4]:
# Sample text for demonstration
text = "The quick brown fox jumps over the lazy dog."

# re.search(pattern, string)
match = re.search(r"fox", text)
print(match)
if match:
    print(f"Found 'fox' at position {match.start()} in the text.")
else:
    print("Pattern not found.")

<re.Match object; span=(16, 19), match='fox'>
Found 'fox' at position 16 in the text.


In [6]:
# re.match(pattern, string)
match_at_start = re.match(r"fox", text)
if match_at_start:
    print("Pattern 'The' found at the beginning of the text.")
else:
    print("Pattern 'The' not found at the beginning of the text.")

Pattern 'The' not found at the beginning of the text.


In [7]:
# re.findall(pattern, string)
all_occurrences = re.findall(r"\b\w{4}\b", text)
print("All four-letter words:", all_occurrences)

All four-letter words: ['over', 'lazy']


In [8]:
# re.sub(pattern, replacement, string)
replaced_text = re.sub(r"fox", "cat", text)
print("Replaced text:", replaced_text)

Replaced text: The quick brown cat jumps over the lazy dog.


In [17]:
# Some examples where regex can be used

In [14]:
def validate_email(email):
    pattern = r'^[\w\.-]+@[a-zA-Z\d\.-]+\.[a-zA-Z]{2,}$'
    return re.match(pattern, email)

# Test
if validate_email("shabbir@email.com"):
    print("Valid email address.")
else:
    print("Invalid email address.")

Valid email address.


In [15]:
def extract_phone_numbers(text):
    pattern = r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    return re.findall(pattern, text)

# Test
text = "Contact us at 123-456-7890 or 987.654.3210 for inquiries."
print(extract_phone_numbers(text))

['123-456-7890', '987.654.3210']


In [16]:
# Data Validation:
# Example: Validate a Password
#This regex pattern ensures that the password contains at least one letter, one digit, and is at least 8 characters long.

def validate_password(password):
    pattern = r"^(?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]{8,}$"
    return re.match(pattern, password)

# Test
if validate_password("MyPassword"):
    print("Valid password.")
else:
    print("Invalid password.")

Invalid password.


In [17]:
# Text Formatting:
# Example: Format Phone Numbers
#This regex pattern captures and groups digits in sets of three to reformat phone numbers with parentheses and dashes.

def format_phone_numbers(text):
    pattern = r"\b(\d{3})(\d{3})(\d{4})\b"
    return re.sub(pattern, r"(\1) \2-\3", text)

# Test
text = "Contact us at 1234567890 or 987-654-3210 for inquiries."
formatted_text = format_phone_numbers(text)
print(formatted_text)

Contact us at (123) 456-7890 or 987-654-3210 for inquiries.


In [18]:
# URL Extraction from Text:
# Example: Extract URLs from a Web Page
#This regex pattern captures URLs starting with "http://" or "https://" from the given text.
def extract_urls(text):
    pattern = r"https?://\S+"
    return re.findall(pattern, text)

# Test
text = "Check out our website: https://www.example.com and our blog: http://blog.example.com"
urls = extract_urls(text)
print("Extracted URLs:", urls)

Extracted URLs: ['https://www.example.com', 'http://blog.example.com']


In [19]:
# Data Extraction from Logs:
# Example: Extract IP Addresses from Log
# This regex pattern extracts IP addresses in the format "X.X.X.X" from a log entry.
def extract_ip_addresses(log):
    pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
    return re.findall(pattern, log)

# Test
log = "Access from IP 192.168.1.100 at 2023-07-29 10:00:00"
ip_addresses = extract_ip_addresses(log)
print("Extracted IP addresses:", ip_addresses)

Extracted IP addresses: ['192.168.1.100']


In [20]:
# Tokenization:
# Example: Tokenize Sentences
def tokenize_sentences(text):
    pattern = r"[.!?]\s"
    return re.split(pattern, text)

# Test
text = "Hello! How are you? I hope you're doing well."
sentences = tokenize_sentences(text)
print("Tokenized Sentences:", sentences)

Tokenized Sentences: ['Hello', 'How are you', "I hope you're doing well."]


In [21]:
# Removing HTML Tags:
# Example: Remove HTML Tags from Text
# This regex pattern removes all HTML tags from the given text.

def remove_html_tags(text):
    pattern = r"<.*?>"
    return re.sub(pattern, "", text)

# Test
html_text = "<p>Hello, <b>world</b>!</p>"
clean_text = remove_html_tags(html_text)
print("Clean Text:", clean_text)

Clean Text: Hello, world!
