In [2]:
import re

## Basic Syntax

- `.`: Matches any single character except newline
- `^`: Matches the start of the string
- `$`: Matches the end of the string (or just before a newline at the very end of the string)
- `*`: Matches 0 or more repetitions of the preceding element
- `+`: Matches 1 or more repetitions of the preceding element
- `?`: Matches 0 or 1 repetition of the preceding element
- `{n}`: Matches exactly n repetitions of the preceding element
- `{n,}`: Matches at least n repetitions of the preceding element
- `{n,m}`: Matches between n and m repetitions of the preceding element
- `|`: Alternation, matches either the pattern before or the pattern after the symbol

## Character Classes

- `[abc]`: Matches any one of the characters a, b, or c
- `[^abc]`: Matches any character that is not a, b, or c
- `[a-z]`: Matches any character from a to z
- `[A-Z]`: Matches any character from A to Z
- `[0-9]`: Matches any digit
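- `\d`: Matches any digit (equivalent to [0-9] for ASCII text; in Python 3 `str` patterns it also matches other Unicode digits unless `re.ASCII` is set)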
- `\D`: Matches any non-digit
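- `\w`: Matches any word character (equivalent to [a-zA-Z0-9_] for ASCII text; in Python 3 `str` patterns it also matches Unicode letters and digits unless `re.ASCII` is set)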
- `\W`: Matches any non-word character
- `\s`: Matches any whitespace character
- `\S`: Matches any non-whitespace character

## Special Characters

- `\`: Escapes a special character
- `()` : Defines a group
- `(?:...)`: Non-capturing group
- `(?=...)`: Positive lookahead assertion
- `(?!...)`: Negative lookahead assertion

## Examples

- `abc`: Matches the string "abc"
- `abc|def`: Matches "abc" or "def"
- `^abc`: Matches any string that starts with "abc"
- `abc$`: Matches a string that ends with "abc"
- `a.b`: Matches any string containing "a", any character, then "b"
- `a*`: Matches 0 or more 'a's
- `a+`: Matches 1 or more 'a's
- `a?`: Matches 0 or 1 'a'
- `\d{2,4}`: Matches between 2 and 4 digits

In [3]:
# Sample HTML for the extraction examples: one nav link, three product links
# (URLs of the form /p/<10-digit id>/<slug>), and one footer link.
s = '''
<a class="nav-link" href="https://amazon.com/categories/ski">Ski</a>
<a class="product-link" href="https://amazon.com/p/1234567890/awesome-product-1">Coffee beans</a>
<a class="product-link" href="https://amazon.com/p/6454343333/ok-product-2">Backcountry Ski</a>
<a class="product-link" href="https://amazon.com/p/6543565454/great-product-1">Book</a>
<a class="footer-link" href="https://amazon.com/about-us">About Us</a>
'''

In [5]:
# Goal: extract only the product links
# Expected output:
# https://amazon.com/p/1234567890/awesome-product-1
# https://amazon.com/p/6454343333/ok-product-2
# https://amazon.com/p/6543565454/great-product-1

# String-literal prefixes:
# b'...' is a bytes literal
# r'\n\t' is a raw string: backslashes are kept as-is, which is why regex patterns are written with r'...'
# f'{variable1}' is a formatted string (f-string)

# First attempt: spell out one anchor tag literally. This finds only that one
# tag in `s` (not the other two product links), and because the pattern has no
# capture group, findall returns the whole tag rather than just the URL.
pattern = r'<a class="product-link" href="https://amazon.com/p/1234567890/awesome-product-1">Coffee beans</a>'

re.findall(pattern, s)

['<a class="product-link" href="https://amazon.com/p/1234567890/awesome-product-1">Coffee beans</a>']

In [6]:
# \d matches one digit at a time, so findall returns each digit as its own match.
re.findall(r'\d', '123')

['1', '2', '3']

In [7]:
# Three consecutive \d tokens consume all three digits as a single match.
re.findall(r'\d\d\d', '123')

['123']

In [9]:
# \d{3} is shorthand for \d\d\d; findall scans the whole string, so the digits
# are found even though they sit in the middle of other characters.
re.findall(r'\d{3}', 'dfgh123ghjkl')

['123']

In [13]:
# Generalize the literal pattern: \d{10} matches any 10-digit product id and
# [^"]+ matches the slug up to the closing quote of the href value.
# - The dot in "amazon.com" is escaped so it matches only a literal "."
#   (an unescaped "." would match any character).
# - [^"]+ is used instead of a greedy .+ so the match cannot run past the end
#   of the href value if another '">' appears later on the same line.
re.findall(r'<a class="product-link" href="https://amazon\.com/p/\d{10}/[^"]+">', s)

['<a class="product-link" href="https://amazon.com/p/1234567890/awesome-product-1">',
 '<a class="product-link" href="https://amazon.com/p/6454343333/ok-product-2">',
 '<a class="product-link" href="https://amazon.com/p/6543565454/great-product-1">']

In [10]:
# Parentheses capture just the URL, so findall returns the group's text
# instead of the whole tag.
# The literal prefix must include class="product-link": every anchor in `s`
# has that attribute before href, so a bare '<a href="' prefix never matches
# and findall would return [].
# The dot in "amazon.com" is escaped to match a literal ".", and [^"]+ stops
# the capture at the closing quote of the href value.
re.findall(r'<a class="product-link" href="(https://amazon\.com/p/\d+/[^"]+)">', s)

['https://amazon.com/p/1234567890/awesome-product-1',
 'https://amazon.com/p/6454343333/ok-product-2',
 'https://amazon.com/p/6543565454/great-product-1']

In [19]:
# An anchor whose opening tag is followed by nested content on the same line.
b = '''<a href="https://amazon.com/p/6543565454/great-product-1"> <img></img> </a>'''

# .+? is the lazy form of .+: it expands only as far as needed, so the match
# ends at the first '">' rather than the last one on the line.
# The dot in "amazon.com" is escaped so it matches only a literal ".".
re.findall(r'<a href="https://amazon\.com/p/6543565454/.+?">', b)

['<a href="https://amazon.com/p/6543565454/great-product-1">']

# Practice Problems

### Problem 1: Email Extraction

**Problem**: Extract emails from a given string.  
**String**: "Contact us at support@example.com or sales@example.org"  
**Expected output**:

["support@example.com", "sales@example.org"]

In [3]:
def extract_emails(text):
    """Return every email-like substring found in ``text``.

    Args:
        text: The string to search.

    Returns:
        A list of the matched addresses in order of appearance; empty if
        nothing matches.
    """
    # Local part, "@", domain labels, then a top-level domain of 2+ letters.
    # The TLD class is [A-Za-z]: inside [...] a "|" is a literal pipe, not
    # alternation, so writing [A-Z|a-z] would also accept "|" inside the TLD
    # (e.g. "user@example.co|mx" would match in full).
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    # findall returns the full match text because the pattern has no groups.
    return re.findall(email_pattern, text)

input_string = "Contact us at support@example.com or sales@example.org"
result = extract_emails(input_string)
print(result)

['support@example.com', 'sales@example.org']


### Problem 2: Phone Number Validation

**Problem**: Validate and extract US phone numbers in the format xxx-xxx-xxxx.  
**String**: "My numbers are 123-456-7890 or 333-333-3333"

In [5]:
def extract_phone_numbers(text):
    """Find every phone number written as xxx-xxx-xxxx in ``text``.

    Args:
        text: The string to search.

    Returns:
        A list of the matching numbers in order of appearance; empty if
        none are found.
    """
    # \b on both ends requires the number to start and end at a word boundary,
    # so a longer run of digits is not matched in part.
    us_phone = re.compile(r'\b\d{3}-\d{3}-\d{4}\b')
    return us_phone.findall(text)

input_string = "My numbers are 123-456-7890 or 333-333-3333"
result = extract_phone_numbers(input_string)
print(result)

['123-456-7890', '333-333-3333']


### Problem 3: Password Strength Check

**Problem**: Check if a password is at least 8 characters long, contains a digit, an uppercase, and a lowercase letter.  
**String**: "Password1"

In [6]:
def check_password_strength(password):
    """Report whether ``password`` meets the minimum strength rules.

    A password passes when it is at least 8 characters long and contains at
    least one digit, one uppercase letter, and one lowercase letter.

    Args:
        password: The candidate password string.

    Returns:
        True if every rule is satisfied, otherwise False.
    """
    # `and` short-circuits, so the rules are evaluated in order (length first)
    # and evaluation stops at the first rule that fails.
    return (
        len(password) >= 8
        and any(ch.isdigit() for ch in password)
        and any(ch.isupper() for ch in password)
        and any(ch.islower() for ch in password)
    )

input_password = "Password1"
result = check_password_strength(input_password)

print(
    "Password is strong!"
    if result
    else "Password is not strong. Please make sure it is at least 8 characters long and contains a digit, an uppercase, and a lowercase letter."
)

Password is strong!


### Problem 4: Extracting Domain Name

**Problem**: Extract the domain name from an email address.  
**String**: "user@example.com"

In [7]:
def extract_domain_from_email(email):
    """Pull the domain part out of an email address.

    Args:
        email: The address to inspect.

    Returns:
        The run of letters, digits, dots and hyphens that follows the first
        matching "@", or None when no such run exists.
    """
    # Group 1 captures everything after "@" that looks like a domain.
    found = re.search(r'@([A-Za-z0-9.-]+)', email)
    return found.group(1) if found else None

input_email = "user@example.com"
result = extract_domain_from_email(input_email)

print(
    f"The domain from the email '{input_email}' is: {result}"
    if result
    else "Invalid email format."
)

The domain from the email 'user@example.com' is: example.com


### Problem 5: Validating an IP Address

**Problem**: Check if a string is a valid IPv4 address.  
**String**: "192.168.1.1"

In [14]:
def is_valid_ipv4_string(ip_address):
    """Return True if ``ip_address`` is a dotted-quad IPv4 address.

    The string must consist of exactly four groups of 1-3 ASCII digits
    separated by dots, and each group must be in the range 0-255. Leading
    zeros (e.g. "192.168.001.001") are accepted.

    Args:
        ip_address: The string to validate.

    Returns:
        True if the string is a valid IPv4 address, otherwise False.
    """
    # Shape check. fullmatch is used instead of match + "$" because "$" also
    # matches just before a trailing newline; [0-9] is used instead of \d
    # because \d matches non-ASCII digits in str patterns.
    ipv4_pattern = re.compile(r'([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})')
    match = ipv4_pattern.fullmatch(ip_address)
    if match is None:
        return False

    # Range check: the shape alone would accept values such as 999.999.999.999.
    return all(int(octet) <= 255 for octet in match.groups())

input_ip = "192.168.1.1"
result = is_valid_ipv4_string(input_ip)

if result:
    print(f"The IP address '{input_ip}' is a valid IPv4 address.")
else:
    print(f"The IP address '{input_ip}' is not a valid IPv4 address.")

The IP address '192.168.1.1' is a valid IPv4 address.
