# Exercise: Regular Expressions

In [2]:
import re

In [None]:
#1 Question: Match all email addresses (SOLVED)

inputs = ["My email is john@email.com and mary@otherplace.net",
          "Visit us at support@ourcompany.co.uk for help",
          "No emails here"]

pattern = r"(\w+)@(\w+)\.(\w+)"

matches = []
for input in inputs:
    matches += re.findall(pattern, input)

for m in matches:
    username = m[0]
    mailserver = m[1]
    domain = m[2]
    print(f"Username: {username}")
    print(f"Mailserver: {mailserver}")
    print(f"Domain: {domain}")
    print('')

In [None]:
#2 Question: Extract domain from email

inputs = ["john@email.com",
          "mary+newsletter@gmail.com",
          "support@ourcompany.co.uk"]

# your code here ...:

pattern = r"@(\w+\.\w+)"

for email_address in inputs:
    match = re.search(pattern, email_address)
    if match:
        domain = match.group(1)
        print(f"The domain of the email '{email_address}' is: {domain}")
    else:
        print(f"No domain found in the email '{email_address}'.")

In [6]:
#3 Question: Validate phone number

inputs = ["555-123-4567",
          "1 (234) 567-8910",
          "notaphonenumber"]

# your code here ...:
import re

phone_pattern = re.compile(r"^\d{3}[-\.\s]?\d{3}[-\.\s]?\d{4}$")

for phone_number in inputs:
    if phone_pattern.match(phone_number):
        print(f"The phone number '{phone_number}' is valid.")
    else:
        print(f"The input '{phone_number}' is not a valid phone number.")


The phone number '555-123-4567' is valid.
The input '1 (234) 567-8910' is not a valid phone number.
The input 'notaphonenumber' is not a valid phone number.


In [8]:
#4 Question: Extract area code

inputs = ["(555) 123-4567",
          "1 (234) 567-8910",
          "5551234567"]

# your code here ...:


area_code_pattern = re.compile(r"\((\d{3})\)|^1?\s?\((\d{3})\)|^1?\s?(\d{3})")

for phone_number in inputs:
    match = area_code_pattern.search(phone_number)
    if match:
        area_code = next(code for code in match.groups() if code is not None)
        print(f"The area code in '{phone_number}' is: {area_code}")
    else:
        print(f"No area code found in '{phone_number}'.")

The area code in '(555) 123-4567' is: 555
The area code in '1 (234) 567-8910' is: 234
The area code in '5551234567' is: 555


In [9]:
#5 Question: Match URLs and extract host

inputs = ["Visit https://www.example.com for more info",
          "Our website is example.com",
          "No URLs here"]

# your code here ...:
url_pattern = re.compile(r'https?://([a-zA-Z0-9.-]+)')

for text in inputs:
    matches = url_pattern.findall(text)
    if matches:
        print(f"In '{text}':")
        for host in matches:
            print(f"  Host: {host}")
    else:
        print(f"No URLs found in '{text}'.")

In 'Visit https://www.example.com for more info':
  Host: www.example.com
No URLs found in 'Our website is example.com'.
No URLs found in 'No URLs here'.


In [10]:
#6 Question: Remove non-alphabetic characters

inputs = ["Hello world!",
          "123 Main St.",
          "greetings&more"]

# your code here ...:

clean_pattern = re.compile(r'[^a-zA-Z\s]')

for text in inputs:
    cleaned_text = clean_pattern.sub('', text)
    print(f"Original: '{text}', Cleaned: '{cleaned_text}'")


Original: 'Hello world!', Cleaned: 'Hello world'
Original: '123 Main St.', Cleaned: ' Main St'
Original: 'greetings&more', Cleaned: 'greetingsmore'


In [11]:
#7 Question: Find words containing "tion"

inputs = ["This is a test sentence with the word station in it.",
          "No words containing tion here",
          "motion activation vacation"]

# your code here ...:

tion_pattern = re.compile(r'\b\w*tion\w*\b')

for text in inputs:
    matches = tion_pattern.findall(text)
    if matches:
        print(f"In '{text}':")
        for word in matches:
            print(f"  Word: {word}")
    else:
        print(f"No words containing 'tion' found in '{text}'.")


In 'This is a test sentence with the word station in it.':
  Word: station
In 'No words containing tion here':
  Word: tion
In 'motion activation vacation':
  Word: motion
  Word: activation
  Word: vacation


In [12]:
#8 Question: Replace all occurrences of "hello" with "goodbye"

inputs = ["hello world",
          "hello there",
          "no match"]

 # your code here ...:


hello_pattern = re.compile(r'\bhello\b')

for text in inputs:
    replaced_text = hello_pattern.sub('goodbye', text)
    print(f"Original: '{text}', Replaced: '{replaced_text}'")


Original: 'hello world', Replaced: 'goodbye world'
Original: 'hello there', Replaced: 'goodbye there'
Original: 'no match', Replaced: 'no match'


In [13]:
#9 Question: Extract date strings in ISO8601 format

inputs = ["Log from 2023-01-15",
          "Meeting on 2023-02-01T13:00:00Z",
          "No dates"]

 # your code here ...:

iso8601_pattern = re.compile(r'\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}Z)?\b')

for text in inputs:
    matches = iso8601_pattern.findall(text)
    if matches:
        print(f"In '{text}':")
        for date_string in matches:
            print(f"  Date: {date_string}")
    else:
        print(f"No ISO8601 dates found in '{text}'.")


In 'Log from 2023-01-15':
  Date: 2023-01-15
In 'Meeting on 2023-02-01T13:00:00Z':
  Date: 2023-02-01T13:00:00Z
No ISO8601 dates found in 'No dates'.


In [14]:
#10 Question: Validate correctly formatted date

inputs = ["2023-01-15",
          "02/01/2023",
          "invalid date"]

 # your code here ...:
date_pattern = re.compile(r'\b\d{4}-\d{2}-\d{2}\b')

for date_string in inputs:
    match = date_pattern.fullmatch(date_string)
    if match:
        print(f"The date '{date_string}' is correctly formatted.")
    else:
        print(f"The input '{date_string}' is not a correctly formatted date.")



The date '2023-01-15' is correctly formatted.
The input '02/01/2023' is not a correctly formatted date.
The input 'invalid date' is not a correctly formatted date.


In [15]:
#11 Question: Remove punctuation except hyphens

inputs = ["Hello! World?",
          "123-Main_St.",
          "Hi there."]

 # your code here ...:


cleaned_pattern = re.compile(r'[^a-zA-Z0-9\s-]')

for text in inputs:
    cleaned_text = cleaned_pattern.sub('', text)
    print(f"Original: '{text}', Cleaned: '{cleaned_text}'")


Original: 'Hello! World?', Cleaned: 'Hello World'
Original: '123-Main_St.', Cleaned: '123-MainSt'
Original: 'Hi there.', Cleaned: 'Hi there'


In [16]:
#12 Question: Count occurrences of a word

inputs = ["Hello world. Hello!",
          "Hello hello world",
          "no match"]

# your code here ...:

word_to_count = "hello"
word_count_pattern = re.compile(r'\b' + re.escape(word_to_count) + r'\b', flags=re.IGNORECASE)

for text in inputs:
    count = len(word_count_pattern.findall(text))
    print(f"The word '{word_to_count}' occurs {count} times in '{text}'.")


The word 'hello' occurs 2 times in 'Hello world. Hello!'.
The word 'hello' occurs 2 times in 'Hello hello world'.
The word 'hello' occurs 0 times in 'no match'.


In [17]:
#13 Question: Extract IP addresses from log

inputs = ["127.0.0.1 - GET /",
          "User logged in from 192.168.1.1",
          "No IPs"]

# your code here ...:

ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')

for log_entry in inputs:
    ips = ip_pattern.findall(log_entry)
    if ips:
        print(f"In '{log_entry}':")
        for ip in ips:
            print(f"  IP: {ip}")
    else:
        print(f"No IP addresses found in '{log_entry}'.")


In '127.0.0.1 - GET /':
  IP: 127.0.0.1
In 'User logged in from 192.168.1.1':
  IP: 192.168.1.1
No IP addresses found in 'No IPs'.


In [18]:
#14 Question: Redact credit card and SSN numbers

inputs = ["Visa: 4111-1111-1111-1111",
          "My SSN is 111-11-1111",
          "No numbers"]

# your code here ...:

credit_card_pattern = re.compile(r'\b(?:\d[ -]*?){15,16}\d\b')

ssn_pattern = re.compile(r'\b\d{3}-\d{2}-\d{4}\b')

for text in inputs:
    redacted_text = credit_card_pattern.sub('[Credit Card]', text)
    redacted_text = ssn_pattern.sub('[SSN]', redacted_text)
    print(f"Original: '{text}', Redacted: '{redacted_text}'")


Original: 'Visa: 4111-1111-1111-1111', Redacted: 'Visa: [Credit Card]'
Original: 'My SSN is 111-11-1111', Redacted: 'My SSN is [SSN]'
Original: 'No numbers', Redacted: 'No numbers'
