# Lab 2: Regular Expressions

In [1]:
import re

### Problem 1: Extract Email Addresses
**Description:** Extract all email addresses from a given text.

**Sample text:** `Contact us at support@example.com or sales@company.org for assistance.
 For personal inquiries, email john.doe123@university.edu.`

 **Expected Output:**
 `support@example.com
 sales@company.org
 john.doe123@university.edu`

In [2]:
text1 =  (
    'Contact us at support@example.com or '
    'sales@company.org for assistance.'
    'For personal inquiries, email john.doe123@university.edu.'
)

pattern1 = r"[\w.-]+@\w+\.[A-Za-z]{2,}"

emails = re.findall(pattern1, text1)
emails

['support@example.com', 'sales@company.org', 'john.doe123@university.edu']


### Problem 2: Validate Phone Numbers
**Description:** Check if the phone numbers in the text follow the format `XXX-XXX-XXXX.`

**Example**
 - Valid: 123-456-7890, 987-654-3210
 - Invalid: 12-345-67890, 1234567890, 123-45-6789


In [13]:
phone_numbers = [
    "123-456-7890",
    "987-654-3210",
    "00-345-67890",
    "1234567890",
    "123-45-6789"
]
pattern = re.compile(r"^\d{3}-\d{3}-\d{4}$")
valid = []
invalid = []

#re.fullmatch(r"\d{3}-\d{3}-\d{4}", n):

for n in phone_numbers:
    if pattern.match(n):
        valid.append(n)
    else:
        invalid.append(n)

print('problem 2, phone numbers')
print('valid:', valid)
print('invalid:', invalid)

problem 2, phone numbers
valid: ['123-456-7890', '987-654-3210']
invalid: ['00-345-67890', '1234567890', '123-45-6789']


### Problem 3: Extract Dates
**Description:** Extract all dates in the format `DD/MM/YYYY` or `DD-MM-YYYY` from a given text.

**Sample text:**  Important dates: 25/12/2023, 01-01-2024, 31/05/2023, and 15-10-2024.

**Expected Output:**
 25/12/2023
 01-01-2024
 31/05/2023
 15-10-2024

In [17]:
dates = 'Important dates: 25/00/2023, 01-01-2024, 31/05/2023, and 15-10-2024'

pattern1 = r"(\d{2}-\d{2}-\d{4})|(\d{2}/\d{2}/\d{4})"
pattern2 = r"(?:\d{2}-\d{2}-\d{4})|(?:\d{2}/\d{2}/\d{4})"
result1 = re.findall(pattern1, dates)
result2 = re.findall(pattern2, dates)

print('version1 \n', result1)
print('version2 \n', result2)

version1 
 [('', '25/00/2023'), ('01-01-2024', ''), ('', '31/05/2023'), ('15-10-2024', '')]
version2 
 ['25/00/2023', '01-01-2024', '31/05/2023', '15-10-2024']


### Problem 4: Find Repeated Words
**Description:** Identify and extract words that are repeated consecutively in a given text.

**Sample text:** The the quick brown fox jumps over the the lazy dog.

**Expected Output:**
 Repeated words:
 the the
 the the

In [20]:
text4 = "This is is a test. The the quick brown fox jumps over over the lazy dog dog."

pattern4 = r"\b(\w+)\s+\1\b"   # \1 refers back
repeated_words = re.findall(pattern4, text4, flags=re.IGNORECASE)
repeated_words

['is', 'The', 'over', 'dog']

### Problem 5: Extract Hashtags
**Description:** Extract all hashtags from a given text.

**Sample text:** Check out our new products: #Sale2024, #NewArrival, and #Discounts!

**Expected Output:**
#Sale2024
#NewArrival
#Discounts

In [22]:
text5 =  'Check out our new products: #Sale2024, #NewArrival, and #Discounts!'
pattern5 = r'#\w+'
result = re.findall(pattern5, text5)
print('problem 5, hashtags \n', result)

problem 5, hashtags 
 ['#Sale2024', '#NewArrival', '#Discounts']


### Problem 6: Validate Password Strength
**Description:** Check if the passwords meet the following criteria:
- At least 8 characters long
- Contains at least one uppercase letter
- Contains at least one lowercase letter
- Contains at least one digit

**Example:**

Valid: Password123, Secure456

Invalid: weak, password, Password

In [29]:
passwords = [ 'Password123', 'Secure456', 'weak', 'password', 'Password']
pattern6 = r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$"
valid_passwords = [pw for pw in passwords if re.match(pattern6, pw)]
print('valid passwords \n', valid_passwords)

valid passwords 
 ['Password123', 'Secure456']


### Problem 7: Extract URLs
**Description:** Extract all URLs from the given text.

**Sample Text:**
`Visit our website at https://www.example.com or check out http://blog.example.org for updates.`

**Expected Output:**
`https://www.example.com`
`http://blog.example.org`

In [35]:
text_url = 'Visit our website at https: // www.example.com or check out http: // blog.example.org for updates.'
pattern7 = r"(https?://[^\s]+|www\.[^\s]+)"
urls = re.findall(pattern7, text_url)
print(urls)

['www.example.com']


### Problem 8: Replace Multiple Spaces with a Single Space

**Description:** Replace multiple spaces in the text with a single space.

**Sample Text:**
`This   text       has   multiple    spaces.`

**Expected Output:**
This text has multiple spaces

In [45]:
text_spaces = "This   text    has  multiple     spaces."
pattern8 = r"\s{2,}"
single_space_text = re.sub(pattern8, " ", text_spaces)
print( single_space_text)

This text has multiple spaces.


### Problem 9: Extract Quoted Text

**Description:** Extract all text within double quotes ("..." ).

**Sample Text:**
He said, "Hello, world!" and she replied, "Hi there!"

**Expected Output:**
Hello, world!
Hi there!

In [52]:
text_quotes = 'He said, "Hello, world!" and she replied, "Hi there!" '
pattern9 = r'"(.*?)"'
quoted = re.findall(pattern9, text_quotes)
print(quoted)

['Hello, world!', 'Hi there!']


### Problem 10: Validate IP Addresses

**Description:** Check if the IP addresses in the text are valid (format: XXX.XXX.XXX.XXX ).

**Example:**
- Valid: 192.168.1.1, 10.0.0.255
- Invalid: 256.1.2.3, 192.168.01.1, 192.168.1

In [76]:
text_ips = "Valid: 192.168.1.1 and 10.0.0.255; Invalid: 256.1.2.3 or 192.168.01.1 or 192.168.1"
pattern = r'\b(?:\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])' \
          r'(?:\.(?:\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])){3}\b'

valid_ips = re.findall(pattern, text_ips)
print('problem 10, valid IP addresses \n', valid_ips)

problem 10, valid IP addresses 
 ['192.168.1.1', '10.0.0.255']
