# Regex

In [1]:
# Email pattern: local part, '@', domain, then a dot and a TLD of two or more letters.
# The TLD class is [A-Za-z]; inside [...] a '|' is a literal pipe, not alternation,
# so it must not appear there.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
email = 'anna.jones@gmail.com'

In [2]:
import re  # Python Library used for regex operations
# fullmatch returns None unless the entire string fits the pattern,
# so its truthiness selects the message to print.
print('It is a match' if re.fullmatch(pattern, email) else 'Invalid email id. Try again')

It is a match


In [3]:
def validate_email_address(test_your_email):
    """Print whether ``test_your_email`` looks like a valid email address.

    The whole string must match: a local part made of letters, digits or
    ``._%+-``, an ``@``, a domain made of letters, digits, ``.`` or ``-``,
    and a final dot followed by at least two letters.

    Args:
        test_your_email: The string to check.

    Returns:
        None. The verdict is printed to standard output.
    """
    # The TLD class is [A-Za-z]; a '|' inside [...] is a literal pipe character,
    # not alternation, and would let addresses such as 'a@b.c|m' pass.
    # The leading \b also requires the first character to be a word character.
    regex_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    if re.fullmatch(regex_pattern, test_your_email):
        print('It is a match')
    else:
        print('Invalid email id. Try again')

In [4]:
# Run the validator over a handful of sample addresses, in order.
for address in (
    'anna.jones@gmail.com',
    'anna.jonesgml.com',         # no @ sign
    'anna_jones1991@gmail.com',
    'anna%20jones@gmail.com',
    'anna%20jones@gma%il.com',   # between @ and the last dot only A-Z, a-z, 0-9, '.' or '-' are allowed
):
    validate_email_address(address)

It is a match
Invalid email id. Try again
It is a match
It is a match
Invalid email id. Try again


**Regex Tutorial**

1. https://www.datacamp.com/community/tutorials/python-regular-expression-tutorial
2. https://www.w3schools.com/python/python_regex.asp

**Regex Checker**

You can check whether your regular expression works on sample texts or strings using these websites.

1. https://pythex.org/
2. https://regex101.com/
3. http://www.pyregex.com/

## re.search()

In [5]:
# Sample headlines: four start with the case count, the last one has leading words.
texts = [
    lead + ' new cases confirmed in Ireland'
    for lead in ('1,508', '1,712', '678', '20', 'There are 200')
]

In [6]:
# '.*' matches any run of characters (except a newline) and is greedy, so the
# match runs from the start of the string through 'new'.
pattern = r'.*new'
re.search(pattern, texts[0])

<re.Match object; span=(0, 9), match='1,508 new'>

In [7]:
# Keep the Match object so its position and text can be read back.
re_obj = re.search(pattern, texts[0])
(
    re_obj.start(),  # index where the match begins
    re_obj.end(),    # index just past the last matched character
    re_obj.span(),   # (start, end) as one tuple
    re_obj.group(),  # the matched text itself
)

(0, 9, (0, 9), '1,508 new')

In [8]:
# Collect the matched text of the first match in every headline.
list_numbers = []
for headline in texts:
    list_numbers.append(re.search(pattern, headline).group())
list_numbers

['1,508 new', '1,712 new', '678 new', '20 new', 'There are 200 new']

In the last list element the match also includes 'There are', because `.*` matches everything from the start of the string up to `new`.

## re.findall()

In [9]:
# A plain literal pattern: findall returns a list of every occurrence of
# 'Ireland' in each text.
pattern = r'Ireland'
[re.findall(pattern, text) for text in texts]

[['Ireland'], ['Ireland'], ['Ireland'], ['Ireland'], ['Ireland']]

In [10]:
# \d matches exactly one digit, so every digit comes back as its own match.
pattern = r'\d'
[re.findall(pattern, text) for text in texts]

[['1', '5', '0', '8'],
 ['1', '7', '1', '2'],
 ['6', '7', '8'],
 ['2', '0'],
 ['2', '0', '0']]

In [11]:
# [0-9] is a character class for a single ASCII digit; on these texts it gives
# the same result as \d.
pattern = r'[0-9]'
[re.findall(pattern, text) for text in texts]

[['1', '5', '0', '8'],
 ['1', '7', '1', '2'],
 ['6', '7', '8'],
 ['2', '0'],
 ['2', '0', '0']]

**Note the difference between `\d+` and `\d`:**

In [12]:
pattern = r'\d+'  # '+' means one or more repetitions, so each run of digits is a single match
text = 'There were 60 students in 5 classes'
re.findall(pattern, text)

['60', '5']

In [13]:
pattern = r'\d'  # no quantifier: one digit per match, so '60' comes back as '6' and '0'
text = 'There were 60 students in 5 classes'
re.findall(pattern, text)

['6', '0', '5']

**But `\d+` alone does not work for every case: a number containing a comma, such as `1,508`, is split into two matches.**

In [14]:
# The comma is not a digit, so a number such as '1,508' is returned as two
# separate matches, '1' and '508'.
pattern = r'\d+'
[re.findall(pattern, text) for text in texts]

[['1', '508'], ['1', '712'], ['678'], ['20'], ['200']]

## re.split()

In [15]:
# Raw string so that the backslash in \s reaches the regex engine unchanged
# instead of being treated as a (invalid) Python string escape.
# \s matches one whitespace character, so each text is split at every space.
space_sep_tokens = [re.split(r"\s", text) for text in texts]
space_sep_tokens

[['1,508', 'new', 'cases', 'confirmed', 'in', 'Ireland'],
 ['1,712', 'new', 'cases', 'confirmed', 'in', 'Ireland'],
 ['678', 'new', 'cases', 'confirmed', 'in', 'Ireland'],
 ['20', 'new', 'cases', 'confirmed', 'in', 'Ireland'],
 ['There', 'are', '200', 'new', 'cases', 'confirmed', 'in', 'Ireland']]

## re.sub()

Hint: `sub` is short for "substitute".

In [16]:
# Replace every number with "X". The pattern is a raw string and accepts one or
# more digits followed by any number of ",digits" groups, so it covers a
# one-digit number, "20" and "1,508" alike.
space_sep_tokens = [re.sub(r"\d+(?:,\d+)*", "X", text) for text in texts]
space_sep_tokens

['X new cases confirmed in Ireland',
 'X new cases confirmed in Ireland',
 'X new cases confirmed in Ireland',
 'X new cases confirmed in Ireland',
 'There are X new cases confirmed in Ireland']

In [17]:
# Raw string so that \d reaches the regex engine unchanged.
# Only numbers written with a comma (e.g. 1,508) match; numbers without a
# comma are left as they are.
space_sep_tokens = [re.sub(r"\d+,\d+", "More than 1000", text) for text in texts]
space_sep_tokens

['More than 1000 new cases confirmed in Ireland',
 'More than 1000 new cases confirmed in Ireland',
 '678 new cases confirmed in Ireland',
 '20 new cases confirmed in Ireland',
 'There are 200 new cases confirmed in Ireland']

# Other Examples

In [18]:
# Five headlines joined into one newline-separated string (no trailing newline).
texts = '\n'.join([
    '1,508 new cases confirmed in Ireland',
    '712 new cases confirmed in Ireland',
    '678 new cases confirmed in Ireland',
    '20 new cases confirmed in Ireland',
    'There are 200 new cases confirmed in Ireland',
])

In [19]:
# Each match is one line together with its trailing '\n'. The final line has
# no trailing newline, so it is not part of the result.
re.findall(r'.*\n', texts)

['1,508 new cases confirmed in Ireland\n',
 '712 new cases confirmed in Ireland\n',
 '678 new cases confirmed in Ireland\n',
 '20 new cases confirmed in Ireland\n']

In [20]:
# Splitting on the newline character returns all five lines, including the
# last one, without the '\n' separators.
re.split('\n', texts)

['1,508 new cases confirmed in Ireland',
 '712 new cases confirmed in Ireland',
 '678 new cases confirmed in Ireland',
 '20 new cases confirmed in Ireland',
 'There are 200 new cases confirmed in Ireland']

In [21]:
text = 'Please contact us at: contact@abc.com'
# Two capture groups: the part before '@' and the part after it.
match = re.search(r'([\w\.-]+)@([\w\.-]+)', text)

# group(0) is the whole matched text; group(1) is the username and group(2) the host.
for index, label in enumerate(("Email address:", "Username:", "Host:")):
    print(label, match.group(index))

Email address: contact@abc.com
Username: contact
Host: abc.com


In [22]:
# .string is the full text that was passed to re.search, not just the matched part.
match.string

'Please contact us at: contact@abc.com'

In [23]:
# finditer yields one Match object per match found in the text; print the
# (start, end) span of each. Note that the loop rebinds the name `match`.
for match in re.finditer(r'([\w\.-]+)@([\w\.-]+)', text):
    print(match.span())

(22, 37)
