In [3]:
from IPython.display import Image
import pandas as pd
import re

In [2]:
# Load the emails dataset from the notebook's data folder.
# Forward slashes work on Windows and POSIX alike, and avoid the invalid
# "\d" / "\e" escape sequences that the backslash form put in a normal string.
df = pd.read_csv('./data/emails.csv')

In [11]:
# Literal matching: the pattern is plain text with no regex metacharacters.
pattern = "cat"
text = "cats are cute silly and cute cats"



In [12]:
# Collect every non-overlapping occurrence of the pattern in the text.
re.compile(pattern).findall(text)

['cat', 'cat']

In [19]:
# \b anchors at a word boundary, (cat|dog) tries either alternative, and \w*
# consumes the rest of the word (so "caterpillar" matches as a whole word).
pattern = r"\b(cat|dog)\w*"
text = "I have a cat and a dog and caterpillar"

In [20]:
# With a single capturing group, findall reports the group's text for each
# match rather than the whole matched word.
re.compile(pattern).findall(text)

['cat', 'dog', 'cat']

## Anchors (starts/ends)

In [21]:
# ^ anchors the match to the start of the string, so only text that begins
# with "hello" can match.
pattern = "^hello"
word = "hello world"

In [22]:
# Apply the start-anchored pattern to the sample string.
re.compile(pattern).findall(word)

['hello']

In [25]:
# $ anchors the match to the end of the string, so only text that finishes
# with "world" can match.
pattern = "world$"
word = "welcome hello world"

In [26]:
# Apply the end-anchored pattern to the sample string.
re.compile(pattern).findall(word)

['world']

## Meta Sequences

In [27]:
# \d matches one digit, so every digit comes back as its own match.
# The pattern is a raw string so the backslash reaches the regex engine
# without relying on an invalid string escape sequence.
text = "my course id for thsi course is 12345"
pattern = r"\d"

In [28]:
# List each match of the digit pattern in the text.
re.compile(pattern).findall(text)

['1', '2', '3', '4', '5']

In [29]:
# Five \d in a row match exactly five consecutive digits as a single match.
# The pattern is a raw string so the backslashes reach the regex engine
# without relying on invalid string escape sequences.
text = "my course id for thsi course is 12345"
pattern = r"\d\d\d\d\d"

In [30]:
# List each run of digits that satisfies the fixed-length pattern.
re.compile(pattern).findall(text)

['12345']

In [33]:
# \d+ matches one or more digits; + is greedy, so it takes the longest run
# of digits it can. The pattern is a raw string so the backslash reaches the
# regex engine without relying on an invalid string escape sequence.
text = "my course id for thsi course is 12345"
pattern = r"\d+"

In [32]:
# List each greedy run of digits found in the text.
re.compile(pattern).findall(text)

['12345']

## Quantifiers

In [36]:
# \d{11} matches exactly 11 consecutive digits, so the 4-digit course id is
# ignored. The pattern is a raw string so the backslash reaches the regex
# engine without relying on an invalid string escape sequence.
text = "my telephone number is 09884457854 and my course id is 1234"
pattern = r"\d{11}"

In [37]:
# findall returns the matched substrings as a list of strings.
re.compile(pattern).findall(text)

['09884457854']

In [38]:
# search scans the whole string and returns a Match object for the first hit
# (including its span), or None if there is no match anywhere.
re.compile(pattern).search(text)

<re.Match object; span=(23, 34), match='09884457854'>

In [40]:
# match only tries the pattern at the very start of the string, so it yields
# None (no cell output) whenever the text does not begin with the pattern.
re.compile(pattern).match(text)

In [47]:
# \w+ matches runs of word characters (letters, digits and underscore), so
# spaces and punctuation such as "#" act as separators between the tokens.
# The pattern is a raw string so the backslash reaches the regex engine
# without relying on an invalid string escape sequence.
pattern1 = r"\w+"
text1 = "Course code if f456 #nlpgdp#"

In [48]:
# List each run of word characters found in the sample text.
re.compile(pattern1).findall(text1)

['Course', 'code', 'if', 'f456', 'nlpgdp']

In [49]:
# \d{4,8} matches between 4 and 8 consecutive digits. The quantifier is
# greedy, so it takes the first 8 digits of the 11-digit phone number; the 3
# digits left over are too short to match again.
# The pattern is a raw string so the backslash reaches the regex engine
# without relying on an invalid string escape sequence.
text = "my telephone number is 09884457854 and my course id is 1234"
pattern = r"\d{4,8}"

re.findall(pattern, text)

['09884457', '1234']

In [52]:
# The character class [a-zA-Z] with + collects each run of ASCII letters
# (either case); digits and spaces are skipped.
pattern = "[a-zA-Z]+"
text = "my Telephone number is 09884457854 and My course id is 1234"

re.compile(pattern).findall(text)

['my', 'Telephone', 'number', 'is', 'and', 'My', 'course', 'id', 'is']

In [60]:
# Capture telephone numbers written in either of two formats:
#   \d{11}              - 11 digits in a row
#   \d{4}-\d{4}-\d{3}   - the same digits grouped 4-4-3 with hyphens
# The pattern is a raw string so the backslashes reach the regex engine
# without relying on invalid string escape sequences.
text = "my Telephone number is 09884457854 and 0987-8974-897 and my course id is 1234"
pattern = r"\d{11}|\d{4}-\d{4}-\d{3}"

re.findall(pattern, text)

['09884457854', '0987-8974-897']

In [61]:
# Looser alternative: any run of at least 11 characters drawn from digits and
# hyphens, which covers both telephone formats in the sample text.
pattern = "[0-9-]{11,}"
text = "my Telephone number is 09884457854 and 0987-8974-897 and my course id is 1234"

re.compile(pattern).findall(text)

['09884457854', '0987-8974-897']

## Activity

In [67]:
# Goal: validate that emails follow the format firstname.lastname@ons.gov.uk.
# First building block - the first name: one or more ASCII letters.
pattern_fn = r"[a-zA-Z]+"

In [69]:
# Check the first-name pattern against a sample name.
re.compile(pattern_fn).match('pragya')

<re.Match object; span=(0, 6), match='pragya'>

In [71]:
# Second building block: first name followed by a literal dot.
# "\." escapes the dot so it is not treated as "any character"; the raw string
# keeps that backslash without relying on an invalid string escape sequence.
pattern_fn_dot = r"[a-zA-Z]+\."

re.match(pattern_fn_dot, 'sean.joyce')

<re.Match object; span=(0, 5), match='sean.'>

In [74]:
# Third building block: firstname.lastname, where the last name may be
# followed by optional digits ([0-9]*).
# Raw string: keeps "\." intact without an invalid string escape sequence.
pattern_fn_dot_ln = r"[a-zA-Z]+\.[a-zA-Z]+[0-9]*"

re.match(pattern_fn_dot_ln, 'sean.joyce')

<re.Match object; span=(0, 10), match='sean.joyce'>

In [86]:
# Full address pattern: firstname.lastname[digits]@<label>.gov.uk
# - The local part goes straight from the optional digits to "@". The previous
#   "\S+" in between demanded an extra character (rejecting one-letter last
#   names such as a.b@...) and let arbitrary non-space junk through.
# - Raw string: keeps "\." and "\w" intact without invalid string escapes.
# NOTE(review): the stated target format is @ons.gov.uk, but \w+ accepts any
# single-label *.gov.uk domain (e.g. os.gov.uk) - confirm whether the domain
# should be pinned to "ons".
pattern_fn_dot_ln_at = r"[a-zA-Z]+\.[a-zA-Z]+[0-9]*@\w+\.gov\.uk"

re.match(pattern_fn_dot_ln_at, 'sean.joyce@ons.gov.uk')

<re.Match object; span=(0, 21), match='sean.joyce@ons.gov.uk'>

In [88]:
# Sample addresses to run through the email pattern.
emails = ['prag.grt@ons.gov.uk', 'sean.joyce@os.gov.uk', 'seanjoyce@ons.gov.uk']

for email in emails:
    # fullmatch requires the WHOLE address to fit the pattern; re.match only
    # anchors the start and would accept trailing text after ".gov.uk".
    verdict = "is valid" if re.fullmatch(pattern_fn_dot_ln_at, email) else "is not valid"
    print(email, verdict)

prag.grt@ons.gov.uk is valid
sean.joyce@os.gov.uk is valid
seanjoyce@ons.gov.uk is not valid


In [107]:
# Password check (digits only): a password is valid when it is exactly
# 8 characters long, every character is a digit or one of the listed special
# characters, and at least one special character is present.

# re is already imported in the notebook's first cell; kept here so this cell
# also runs on its own.
import re

# Test cases
passwords = [
    "1234567!",    # Valid: 7 digits + 1 special character
    "12345678!",   # Invalid: 8 digits + 1 special character (9 characters in total)
    "abcd1234!",   # Invalid: contains letters (and is 9 characters long)
    "12345678",    # Invalid: no special character
    "12345!78",    # Valid: 7 digits + 1 special character
    "1!234567",    # Valid: 7 digits + 1 special character
    "a1!23456",    # Invalid: contains a letter
    "123456!8"     # Valid: 7 digits + 1 special character
]

# (?=.*[...])  looks ahead for at least one special character.
# [0-9...]{8}  then consumes exactly eight allowed characters.
# \Z           anchors at the true end of the string; "$" would also match
#              just before a trailing newline and so accept "1234567!\n".
pattern = re.compile(r'^(?=.*[!@#$%^&*()_\-+=<>?])[0-9!@#$%^&*()_\-+=<>?]{8}\Z')

for password in passwords:
    if pattern.match(password):
        print(f"'{password}' is Valid")
    else:
        print(f"'{password}' is Invalid")




'1234567!' is Valid
'12345678!' is Invalid
'abcd1234!' is Invalid
'12345678' is Invalid
'12345!78' is Valid
'1!234567' is Valid
'a1!23456' is Invalid
'123456!8' is Valid


In [108]:
# Password check (letters and digits): a password is valid when it is exactly
# 8 characters long, every character is an ASCII letter, a digit or one of the
# listed special characters, and at least one special character is present.

# re is already imported in the notebook's first cell; kept here so this cell
# also runs on its own.
import re

passwords = [
    "1234567!",    # Valid: 7 digits + 1 special character
    "12345678!",   # Invalid: 8 digits + 1 special character (9 characters in total)
    "abcd1234!",   # Invalid: 8 letters/digits + 1 special character (9 characters in total)
    "12345678",    # Invalid: no special character
    "12345!78",    # Valid: 7 digits + 1 special character
    "1!234567",    # Valid: 7 digits + 1 special character
    "a1!23456",    # Valid: 7 letters/digits + 1 special character
    "123456!8"     # Valid: 7 digits + 1 special character
]

# (?=.*[...])        looks ahead for at least one special character.
# [a-zA-Z0-9...]{8}  then consumes exactly eight allowed characters.
# \Z                 anchors at the true end of the string; "$" would also
#                    match just before a trailing newline.
pattern = re.compile(r'^(?=.*[!@#$%^&*()_\-+=<>?])[a-zA-Z0-9!@#$%^&*()_\-+=<>?]{8}\Z')

for password in passwords:
    if pattern.match(password):
        print(f"'{password}' is Valid")
    else:
        print(f"'{password}' is Invalid")


'1234567!' is Valid
'12345678!' is Invalid
'abcd1234!' is Invalid
'12345678' is Invalid
'12345!78' is Valid
'1!234567' is Valid
'a1!23456' is Valid
'123456!8' is Valid
