# Regular Expressions (regex)

---

## Table of Contents
1. Introduction to Regex
2. Basic Patterns
3. Character Classes
4. Quantifiers
5. Anchors and Boundaries
6. Groups and Capturing
7. re Module Functions
8. Flags and Modifiers
9. Common Patterns
10. Key Points
11. Practice Exercises

---

## 1. Introduction to Regex

Regular expressions are patterns used to match character combinations in strings.

In [None]:
import re

# Basic example: find a literal word and report where it sits in the text
text = "The quick brown fox jumps over the lazy dog"
pattern = r"fox"

match = re.search(pattern, text)
if match is not None:
    # span() yields the same (start, end) pair as start()/end()
    start, end = match.span()
    print(f"Found '{match.group()}' at position {start}-{end}")

In [None]:
# Raw strings (r"...") keep backslashes literal, which is what regex wants.
# A normal string literal has to double every backslash to get the same text.
escaped_form = "\\d"
raw_form = r"\d"
print(escaped_form)   # Prints: \d
print(raw_form)       # Prints: \d (raw string)

# Prefer raw strings whenever a regex pattern is written
pattern = r"\d+"  # One or more digits

---

## 2. Basic Patterns

In [None]:
# Literal characters match themselves exactly
text = "Hello World"
literal_hit = re.search(r"World", text)
print(literal_hit)  # Match object for the exact text

In [None]:
# Metacharacters: emit the quick-reference table with a single print call
print(
    "Metacharacters:",
    "  .   - Any character except newline",
    "  ^   - Start of string",
    "  $   - End of string",
    "  *   - 0 or more",
    "  +   - 1 or more",
    "  ?   - 0 or 1",
    "  |   - OR",
    "  []  - Character class",
    "  ()  - Group",
    "  {}  - Quantifier",
    "  \\  - Escape",
    sep="\n",
)

In [None]:
# The dot (.) stands for any single character (except a newline)
text = "cat cot cut c9t c_t"
any_middle_char = re.compile(r"c.t")
matches = any_middle_char.findall(text)
print("c.t matches: " + str(matches))

In [None]:
# Escaping metacharacters: \$ and \. are matched as literal characters
text = "Price: $10.99"
price_hit = re.search(r"\$\d+\.\d+", text)
print(price_hit)  # Match $10.99

---

## 3. Character Classes

In [None]:
# Character classes with []: each pattern below is run against the same text
text = "The cat sat on a mat"

for class_pattern in (
    r"[cms]at",     # c, m, or s followed by at
    r"[a-z]at",     # lowercase letter + at
    r"[A-Za-z]at",  # any letter + at
):
    print(re.findall(class_pattern, text))

In [None]:
# Negated character class [^...]: the middle character must NOT be a vowel
text = "cat cot cut c1t c_t"
non_vowel_middle = re.findall(r"c[^aeiou]t", text)
print(non_vowel_middle)

In [None]:
# Predefined character classes: print the reference table in one call
print(
    "Predefined classes:",
    r"  \d - Digit [0-9]",
    r"  \D - Non-digit [^0-9]",
    r"  \w - Word char [a-zA-Z0-9_]",
    r"  \W - Non-word char",
    r"  \s - Whitespace [\t\n\r\f\v ]",
    r"  \S - Non-whitespace",
    sep="\n",
)

In [None]:
# Using predefined classes
text = "Order #12345 on 2024-01-15"

# Run the searches first: before Python 3.12 an f-string replacement field
# may not contain a backslash, so r'\d+' inside {...} is a SyntaxError there.
digits = re.findall(r"\d+", text)
words = re.findall(r"\w+", text)
non_digits = re.findall(r"\D+", text)

print(f"Digits: {digits}")
print(f"Words: {words}")
print(f"Non-digits: {non_digits}")

---

## 4. Quantifiers

In [None]:
# Quantifiers control how many times the preceding element may repeat
print(
    "Quantifiers:",
    "  *     - 0 or more (greedy)",
    "  +     - 1 or more (greedy)",
    "  ?     - 0 or 1 (greedy)",
    "  {n}   - Exactly n",
    "  {n,}  - n or more",
    "  {n,m} - Between n and m",
    "  *?, +?, ?? - Non-greedy versions",
    sep="\n",
)

In [None]:
# Examples: the same text searched with *, + and ? applied to "b"
text = "ac abc abbc abbbc"

for quantified in (
    r"ab*c",  # 0 or more b
    r"ab+c",  # 1 or more b
    r"ab?c",  # 0 or 1 b
):
    print(f"{quantified}: {re.findall(quantified, text)}")

In [None]:
# Specific counts: {n}, {n,} and {n,m} applied to "b"
text = "a ab abb abbb abbbb"

for counted in (
    r"ab{2}",    # Exactly 2 b
    r"ab{2,}",   # 2 or more b
    r"ab{1,3}",  # 1 to 3 b
):
    print(f"{counted}: {re.findall(counted, text)}")

In [None]:
# Greedy vs Non-greedy
text = "<div>content</div>"

# Greedy (default): .* runs to the last ">" it can reach
greedy_hits = re.findall(r"<.*>", text)
print(f"Greedy: {greedy_hits}")

# Non-greedy (?): .*? stops at the first ">" that completes a match
lazy_hits = re.findall(r"<.*?>", text)
print(f"Non-greedy: {lazy_hits}")

---

## 5. Anchors and Boundaries

In [None]:
# ^ and $ anchors: each pattern doubles as its own label in the output
text = "Hello World"

for anchored in (r"^Hello", r"^World", r"World$", r"Hello$"):
    print(f"{anchored}: {re.search(anchored, text)}")

In [None]:
# Word boundaries \b
text = "cat category caterpillar"

# Search outside the f-strings: a backslash inside an f-string replacement
# field (r'\bcat\b' within {...}) is a SyntaxError before Python 3.12.
all_hits = re.findall(r"cat", text)                # All occurrences
whole_word_hits = re.findall(r"\bcat\b", text)     # Only whole word

print(f"cat: {all_hits}")
print(f"\\bcat\\b: {whole_word_hits}")

In [None]:
# Start and end of words
text = "unhappy happiness unhappiness"

# Patterns containing backslashes are evaluated outside the f-strings so the
# cell also runs on Python versions older than 3.12.
un_words = re.findall(r"\bun\w+", text)
ness_words = re.findall(r"\w+ness\b", text)

print(f"Words starting with 'un': {un_words}")
print(f"Words ending with 'ness': {ness_words}")

---

## 6. Groups and Capturing

In [None]:
# Basic groups (): parentheses capture the two words separately
text = "John Smith, Jane Doe"
pattern = r"(\w+) (\w+)"

match = re.search(pattern, text)
if match is not None:
    captured = match.groups()
    print(f"Full match: {match.group(0)}")
    # Group numbers start at 1; enumerate the captured tuple accordingly
    for number, value in enumerate(captured, start=1):
        print(f"Group {number}: {value}")
    print(f"All groups: {captured}")

In [None]:
# Named groups (?P<name>...): captured parts are addressed by name
text = "2024-01-15"
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"

match = re.search(pattern, text)
if match is not None:
    date_parts = match.groupdict()
    print(f"Year: {date_parts['year']}")
    print(f"Month: {date_parts['month']}")
    print(f"Day: {date_parts['day']}")
    print(f"groupdict: {date_parts}")

In [None]:
# Non-capturing groups (?:...)
text = "http://example.com https://secure.com"

# Capturing group: findall returns only the captured scheme
scheme_only = re.compile(r"(https?)://")
print(scheme_only.findall(text))

# Non-capturing group: findall returns each whole match
whole_url = re.compile(r"(?:https?)://\S+")
print(whole_url.findall(text))

In [None]:
# Alternation with groups
text = "I have a cat and a dog"

for either_pet in (
    r"cat|dog",    # cat OR dog
    r"(cat|dog)",  # Same with group
):
    print(re.findall(either_pet, text))

In [None]:
# Backreferences \1, \2, etc.
text = "hello hello world world world"

# \1 must repeat whatever group 1 captured, so this finds doubled words
pattern = r"\b(\w+)\s+\1\b"
repeated = [hit.group(1) for hit in re.finditer(pattern, text)]
print(f"Repeated words: {repeated}")

---

## 7. re Module Functions

In [None]:
# re.search() - first match anywhere in the string
text = "cat bat rat"
match = re.search(r"[br]at", text)

found = None
if match:
    found = match.group()
print(f"search: {found}")

In [None]:
# re.match() - succeeds only when the pattern matches at position 0
text = "cat bat rat"

for candidate in ("cat", "bat"):
    # 'bat' is present but not at the start, so its result is None
    print(f"match '{candidate}': {re.match(candidate, text)}")

In [None]:
# re.findall() - collect every match into a list
text = "cat bat rat mat"
rhymes = re.compile(r"[a-z]at").findall(text)
print(f"findall: {rhymes}")

In [None]:
# re.finditer() - lazily yields one match object per hit
text = "cat bat rat"
word = re.compile(r"\w+")
for match in word.finditer(text):
    start, end = match.span()
    print(f"Found '{match.group()}' at {start}-{end}")

In [None]:
# re.sub() - replace whatever the pattern matches
text = "Hello World"
word = re.compile(r"\w+")

print(re.compile(r"World").sub("Python", text))
print(word.sub("X", text))           # Replace all words
print(word.sub("X", text, count=1))  # Replace first only

In [None]:
# re.sub() with function: the callable receives each match object
def double(match):
    """Return twice the integer matched by ``match``, rendered as a string."""
    number = int(match.group())
    return str(number * 2)


text = "1 apple, 2 oranges, 3 bananas"
doubled_text = re.sub(r"\d+", double, text)
print(doubled_text)

In [None]:
# re.split() - cut the string wherever the pattern matches
text = "one,two;three four"

separators = re.compile(r"[,;\s]+")
print(separators.split(text))

In [None]:
# re.compile() - build the pattern object once, then reuse it
pattern = re.compile(r"\d{3}-\d{4}")

texts = ["Call 555-1234", "Fax 555-5678", "No number here"]
for text in texts:
    match = pattern.search(text)
    if match:
        outcome = match.group()
    else:
        outcome = 'No match'
    print(f"{text}: {outcome}")

---

## 8. Flags and Modifiers

In [None]:
# re.IGNORECASE (re.I) - letters match regardless of case
text = "Hello HELLO hello"

case_sensitive = re.findall(r"hello", text)
case_insensitive = re.findall(r"hello", text, flags=re.I)

print(f"Without flag: {case_sensitive}")
print(f"With IGNORECASE: {case_insensitive}")

In [None]:
# re.MULTILINE (re.M) - ^ and $ match line boundaries
text = """First line
Second line
Third line"""

# Evaluate outside the f-strings: r'^\w+' inside a replacement field is a
# SyntaxError before Python 3.12 because of the backslash.
first_word_only = re.findall(r"^\w+", text)
first_word_per_line = re.findall(r"^\w+", text, re.MULTILINE)

print(f"Without MULTILINE: {first_word_only}")
print(f"With MULTILINE: {first_word_per_line}")

In [None]:
# re.DOTALL (re.S) - lets "." match a newline as well
text = "Hello\nWorld"

single_line_hit = re.search(r"Hello.World", text)
across_newline_hit = re.search(r"Hello.World", text, flags=re.S)

print(f"Without DOTALL: {single_line_hit}")
print(f"With DOTALL: {across_newline_hit}")

In [None]:
# re.VERBOSE (re.X) - whitespace and # comments inside the pattern are ignored
phone_layout = r"""
    \d{3}    # Area code
    [-.]     # Separator
    \d{3}    # Exchange
    [-.]     # Separator
    \d{4}    # Number
"""
pattern = re.compile(phone_layout, flags=re.X)

phone_hit = pattern.search("Call 555-123-4567")
print(phone_hit)

In [None]:
# Combining flags: OR them together into a single value
text = """Hello World
HELLO PYTHON"""

pattern = r"^hello"
both_flags = re.I | re.M
matches = re.compile(pattern, both_flags).findall(text)
print("Combined flags: " + str(matches))

---

## 9. Common Patterns

In [None]:
# Email validation
email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

emails = ["user@example.com", "invalid@", "test.user@domain.co.uk"]
for email in emails:
    # fullmatch anchors both ends; re.match only anchors the start, so a
    # string such as "user@example.com<junk>" would wrongly count as valid.
    if re.fullmatch(email_pattern, email):
        print(f"Valid: {email}")
    else:
        print(f"Invalid: {email}")

In [None]:
# Phone number
# Separators may be "-", "." or whitespace, so "(555) 123-4567" is accepted
# along with the dashed, dotted and undelimited forms.
phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"

phones = ["555-123-4567", "(555) 123-4567", "555.123.4567", "5551234567"]
for phone in phones:
    # fullmatch rejects trailing characters (e.g. extra digits) that
    # re.match would silently ignore.
    match = re.fullmatch(phone_pattern, phone)
    print(f"{phone}: {'Valid' if match else 'Invalid'}")

In [None]:
# URL extraction: scheme, host, then an optional path
url_pattern = r"https?://[\w.-]+(?:/[\w./-]*)?"

text = "Visit https://example.com or http://test.org/page"
found_urls = re.compile(url_pattern).findall(text)
print(f"URLs: {found_urls}")

In [None]:
# IP address
# Each octet is limited to 0-255 (no leading zeros); a bare \d{1,3} would
# also accept impossible values such as 999.
octet = r"(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])"
ip_pattern = rf"\b(?:{octet}\.){{3}}{octet}\b"

text = "Server at 192.168.1.1 and 10.0.0.1"
found_ips = re.findall(ip_pattern, text)
print(f"IPs: {found_ips}")

In [None]:
# Date patterns: either YYYY-MM-DD or a slash-separated 2/2/4-digit date
date_pattern = r"\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}"

text = "Dates: 2024-01-15 and 01/15/2024"
found_dates = re.compile(date_pattern).findall(text)
print(f"Dates: {found_dates}")

---

## 10. Key Points

1. **Raw strings**: Always use r"..." for patterns
2. **search vs match**: search finds anywhere, match only at start
3. **findall**: Returns all non-overlapping matches as a list (the captured groups instead, if the pattern contains groups)
4. **Character classes**: [abc] or predefined \d, \w, \s
5. **Quantifiers**: *, +, ?, {n,m}
6. **Groups**: () for capturing, (?:) for non-capturing
7. **Anchors**: ^, $, \b for boundaries
8. **Flags**: IGNORECASE, MULTILINE, DOTALL, VERBOSE
9. **compile()**: For reusing patterns

---

## 11. Practice Exercises

In [None]:
# Exercise 1: Extract hashtags from text
# Return list of hashtags (without #)

def extract_hashtags(text):
    """Exercise 1 stub: return the hashtags in ``text`` as a list, without the ``#``.

    Left unimplemented as a practice exercise; as written it returns None.
    """
    pass

# Test: extract_hashtags("I love #Python and #coding!")

In [None]:
# Exercise 2: Validate password
# At least 8 chars, 1 uppercase, 1 lowercase, 1 digit

def is_valid_password(password):
    """Exercise 2 stub: report whether ``password`` meets the exercise rules.

    The rules are: at least 8 characters, with at least one uppercase letter,
    one lowercase letter and one digit. Left unimplemented as a practice
    exercise; as written it returns None.
    """
    pass

# Test: is_valid_password("Passw0rd")

In [None]:
# Exercise 3: Convert camelCase to snake_case

def camel_to_snake(text):
    """Exercise 3 stub: convert a camelCase string to snake_case.

    Left unimplemented as a practice exercise; as written it returns None.
    """
    pass

# Test: camel_to_snake("camelCaseToSnakeCase")

In [None]:
# Exercise 4: Find all words with specific length

def find_words_by_length(text, length):
    """Exercise 4 stub: find all words in ``text`` that have the given ``length``.

    Left unimplemented as a practice exercise; as written it returns None.
    """
    pass

# Test: find_words_by_length("The quick brown fox jumps", 5)

In [None]:
# Exercise 5: Mask sensitive data
# Mask all but last 4 digits of credit card numbers

def mask_credit_card(text):
    """Exercise 5 stub: mask all but the last 4 digits of credit card numbers in ``text``.

    Left unimplemented as a practice exercise; as written it returns None.
    """
    pass

# Test: mask_credit_card("Card: 1234-5678-9012-3456")

---

## Solutions

In [None]:
# Solution 1:
def extract_hashtags(text):
    """Return every hashtag in ``text`` as a list, without the leading ``#``."""
    hashtag = re.compile(r"#(\w+)")
    # Group 1 is the tag body, i.e. everything after the "#"
    return [hit.group(1) for hit in hashtag.finditer(text)]

print(extract_hashtags("I love #Python and #coding!"))

In [None]:
# Solution 2:
def is_valid_password(password):
    """Return True when ``password`` satisfies every rule of the exercise.

    Rules: at least 8 characters, and at least one uppercase letter, one
    lowercase letter and one digit.
    """
    required_patterns = (r"[A-Z]", r"[a-z]", r"\d")
    long_enough = len(password) >= 8
    # Each required pattern must be found somewhere in the password
    return long_enough and all(
        re.search(required, password) for required in required_patterns
    )

print(f"Passw0rd: {is_valid_password('Passw0rd')}")
print(f"password: {is_valid_password('password')}")

In [None]:
# Solution 3:
def camel_to_snake(text):
    """Convert a camelCase (or PascalCase) identifier to snake_case.

    Word boundaries are recognised between a lowercase letter or digit and a
    following uppercase letter ("version2Update" -> "version2_update"), and
    between an acronym and the capitalised word after it
    ("parseHTTPResponse" -> "parse_http_response").
    """
    # Split an uppercase run from the capitalised word that follows it
    text = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", text)
    # Split a lowercase letter or digit from the uppercase letter after it
    text = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", text)
    return text.lower()

print(camel_to_snake("camelCaseToSnakeCase"))

In [None]:
# Solution 4:
def find_words_by_length(text, length):
    """Return the words in ``text`` that are exactly ``length`` characters long."""
    # \b on both sides keeps the match from being part of a longer word
    exact_length_word = re.compile(r"\b\w{{{}}}\b".format(length))
    return [hit.group() for hit in exact_length_word.finditer(text)]

print(find_words_by_length("The quick brown fox jumps", 5))

In [None]:
# Solution 5:
def mask_credit_card(text):
    """Mask all but the last four digits of 16-digit card numbers in ``text``.

    The four digit groups may be separated by a hyphen or whitespace, or be
    written without separators. Only digits are replaced with "*"; the
    separators are kept so the masked number keeps its original grouping.
    """
    def mask(match):
        # group(1): first twelve digits plus separators; group(2): last four
        hidden = re.sub(r"\d", "*", match.group(1))
        return hidden + match.group(2)
    return re.sub(r"(\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?)(\d{4})", mask, text)

print(mask_credit_card("Card: 1234-5678-9012-3456"))