# Regular Expressions

Regular expressions are a way to search for patterns in text strings.

This is a universal concept that applies to any programming language or text-editing program.

We're going to learn the concepts while we learn the syntax for Python.

The goal of regular expressions is to be able to search for a specific type of text inside a string. For example, if we have a form on our webpage that asks for an email address, can we check whether the submitted string actually follows the form of an email? That form is: some letters, numbers, or special characters, then an @ sign, then some more letters, numbers, or special characters, then a ., then a few more letters.

In [None]:
import re

# Sample text that every pattern in this notebook is searched against.
# It is a raw string so that the lone backslash in the metacharacter list is
# kept literally instead of being read as the invalid escape sequence "\ ".
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
123abc

Hello HelloHello

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

utexas.edu

321-555-4321
123.555.1234

daniel-mitchell@utexas.edu

Mr. Johnson
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

## Searching literals

In [None]:
# Compile the literal sequence "abc" into a reusable pattern object
# (flags=0 is the default: no special matching modes).
pattern = re.compile(r'abc', flags=0)

In [None]:
# Build an iterator that yields one Match object per non-overlapping hit
# of the compiled pattern in the sample text.
matches = re.finditer(pattern, text_to_search)

In [None]:
# Walk the iterator and show every Match object (its span and matched text).
for found in matches:
    print(found)

In [None]:
# Slice the sample text with the span 69-72 to see the characters it covers.
print(text_to_search[slice(69, 72)])

In [None]:
# A literal search is order-sensitive: this looks for the exact sequence "cba".
pattern = re.compile(r'cba')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

## Searching special characters

In [None]:
# An unescaped dot is a metacharacter: it matches any single character
# except a newline.
pattern = re.compile(r'.')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# Escaping the dot with a backslash makes it match only a literal period.
pattern = re.compile(r'\.')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# \d matches a single digit character.
pattern = re.compile(r'\d')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# \D is the complement of \d: any single character that is not a digit.
pattern = re.compile(r'\D')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# A digit immediately followed by a word character
# (\w: a letter, a digit or an underscore).
pattern = re.compile(r'\d\w')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# A digit immediately followed by a whitespace character
# (\s: space, tab, newline, ...).
pattern = re.compile(r'\d\s')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

## Word boundary

In [None]:
# The sample text contains the line "Hello HelloHello"; with no boundary
# markers, the bare literal is found at every occurrence of "Hello".
pattern = re.compile(r'Hello')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# \b after the literal requires a word boundary right behind "Hello",
# i.e. the next character must not be a word character.
pattern = re.compile(r'Hello\b')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# Word boundaries on both sides: "Hello" must stand alone as a whole word.
pattern = re.compile(r'\bHello\b')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# \B is the opposite of \b: "Hello" must NOT start at a word boundary
# (it is glued to a preceding word character) but must end at one.
pattern = re.compile(r'\BHello\b')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# A digit that sits directly after a word boundary, i.e. at the start of a word.
pattern = re.compile(r'\b\d')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# ^ anchors the match to the start of the string (no MULTILINE flag is
# passed), so this finds a whitespace character only at the very beginning.
pattern = re.compile(r'^\s')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# A character set: one of the characters 1, 2 or 3, followed by any word character.
pattern = re.compile(r'[123]\w')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# A range inside a set: two consecutive lowercase ASCII letters.
pattern = re.compile(r'[a-z][a-z]')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# An alphanumeric character followed by a letter or a hyphen (a hyphen placed
# last in a set is a literal hyphen, not a range operator).
# The uppercase range is spelled A-Z: the range A-z would also admit the
# ASCII punctuation that lies between 'Z' and 'a' ([ \ ] ^ _ `).
pattern = re.compile(r'[a-zA-Z0-9][a-zA-Z-]')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

In [None]:
# A letter followed by any character that is NOT a letter (a leading ^ inside
# a set negates it).
# The uppercase range is spelled A-Z: with A-z the negated set would wrongly
# treat the punctuation between 'Z' and 'a' ([ \ ] ^ _ `) as letters.
pattern = re.compile(r'[a-zA-Z][^a-zA-Z]')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

## Character groups

In [None]:
# A group with alternatives: "abc", "edu" or "texas", each of which must be
# followed by a word boundary.
pattern = re.compile(r'(abc|edu|texas)\b')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# Either one uppercase letter or the sequence "llo", followed by a letter.
# The trailing set uses A-Z: the range A-z would also admit the ASCII
# punctuation that lies between 'Z' and 'a' ([ \ ] ^ _ `).
pattern = re.compile(r'([A-Z]|llo)[a-zA-Z]')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

## Quantifiers

In [None]:
# ? makes the preceding item optional: "Mr", an optional period, one
# whitespace character, then an uppercase letter.
pattern = re.compile(r'Mr\.?\s[A-Z]')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# * means "zero or more": after the uppercase initial, take as many
# lowercase letters as follow (possibly none).
pattern = re.compile(r'Mr\.?\s[A-Z][a-z]*')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# A group combined with quantifiers: "Ms" or "Mrs", an optional period,
# whitespace, then a capitalised name.
pattern = re.compile(r'M(s|rs)\.?\s[A-Z][a-z]*')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# {n} repeats exactly n times: 3 digits, a dot or hyphen, 3 digits,
# a dot or hyphen, then 4 digits.
pattern = re.compile(r'\d{3}[.-]\d{3}[.-]\d{4}')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# + means "one or more": a run of letters, digits or underscores, a literal
# dot, then exactly three lowercase letters.
pattern = re.compile(r'[a-zA-Z0-9_]+\.[a-z]{3}')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

In [None]:
# Email-shaped text: a local part (letters, digits, _ . + -), an @ sign,
# a domain name, a literal dot, and a trailing part of one or more characters.
pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

## Accessing information in the Match object

In [None]:

# Same email pattern, with the trailing part limited to 2-4 characters.
pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,4}')
matches = pattern.finditer(text_to_search)
for hit in matches:
    # span(0) is the (start, end) index pair of the whole match.
    begin, end = hit.span(0)
    print((begin, end))
    # group(0) is the matched text itself ...
    print(hit.group(0))
    # ... which is the same text obtained by slicing the source with the span.
    print(text_to_search[begin:end])
    
    


In [None]:
# Sample URLs, one per line, with a leading and a trailing newline.
urls = (
    '\n'
    'https://www.google.com\n'
    'http://yahoo.com\n'
    'https://www.whitehouse.gov\n'
    'https://craigslist.org\n'
)

In [None]:
# "http" with an optional "s", "://", an optional "www." group, then
# word characters, a dot, and more word characters.
pattern = re.compile(r'https?://(www\.)?\w+\.\w+')
matches = pattern.finditer(urls)
for found in matches:
    print(found)

In [None]:
# Capture groups: 1 = optional "www.", 2 = the name, 3 = the dot plus suffix.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for hit in matches:
    # group(2, 3) returns both captured strings; glue them back together.
    print(''.join(hit.group(2, 3)))

In [None]:
# Same three-group URL pattern; this time the groups are recovered by
# slicing the source string with each group's span.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for hit in matches:
    print(hit.group(0))
    name_start, name_end = hit.span(2)
    suffix_start, suffix_end = hit.span(3)
    print(urls[name_start:name_end] + urls[suffix_start:suffix_end])

## Regular Expression Practice Exercise

#### Import the random email data file

In [None]:
import pandas as pd

# Load the practice dataset from the CSV file into a DataFrame.
email_data = pd.read_csv(filepath_or_buffer='Random Email Dataset.csv')

#### Display the Email ids

In [None]:
# Show every row of the 'Email Address' column.
email_data.loc[:, 'Email Address']

#### Find the number of gamail email ids (ending with @gamail.com)

In [None]:
import re

# Keep the address column under a short name for the exercises.
x = email_data['Email Address']

# finditer() searches a single string, so the column has to be turned into
# one string first; str.join does that.
# Example of join: every email id is glued to the next with a "||" separator.
print(str.join('||', x))

In [None]:
# Count the email ids that end with "@gamail.com".
# The ids are joined with single spaces, so an id ends where the next
# character is whitespace or the string is over. (?!\S) checks exactly that;
# a plain \b would also accept longer domains such as "@gamail.com.au".
pattern1 = re.compile(r'[a-zA-Z0-9_]@gamail\.com(?!\S)')
matches = pattern1.finditer(' '.join(x))
# Each match is one gamail id, so the number of matches is the answer.
counter = sum(1 for _ in matches)
print('Number of gamail email ids:', counter)

#### Find the number of yahooo email Ids (ending with @yahooo.com)

In [None]:
# Count the email ids that end with "@yahooo.com".
# The ids are joined with single spaces, so an id ends where the next
# character is whitespace or the string is over. (?!\S) checks exactly that;
# a plain \b would also accept longer domains such as "@yahooo.com.au".
pattern2 = re.compile(r'[a-zA-Z0-9_]@yahooo\.com(?!\S)')
matches = pattern2.finditer(' '.join(x))
# Each match is one yahooo id, so the number of matches is the answer.
counter = sum(1 for _ in matches)
print('Number of yahooo email ids:', counter)


#### Find the number of entries that are not email ids (an entry is not an email id if it lacks an @ followed by a domain ending in .com/.in/.org)

In [None]:
# An entry counts as an email id when it contains "<name>@<domain>" whose
# domain ends in .com, .in or .org (the domain may contain dots and hyphens).
pattern3 = re.compile(r'[a-zA-Z0-9_]+@[a-zA-Z0-9_.-]+\.(?:com|in|org)\b')

# Test every entry on its own so that an entry is counted at most once, which
# keeps the subtraction from the total number of entries below meaningful.
email_ids = sum(1 for entry in x if pattern3.search(entry))

print('Number of email ids:', email_ids)

# Everything that is not an email id is a non-email entry.
total_entries = len(email_data['Email Address'])
print('Total Number of non email entries:', total_entries - email_ids)

#### Find the total number of entries that have the pattern 'asd' in them

In [None]:
# Count the entries that contain "asd" at least once.
pattern4 = re.compile(r'asd')
# search() is applied per entry: an entry with several "asd" occurrences is
# still a single entry and must be counted only once.
counter = sum(1 for entry in x if pattern4.search(entry))

print('Number of such patterns:', counter)

#### Find the number of email ids that start with k

In [None]:
# Count the email ids whose first character is "k".
# The ids are joined with single spaces, so an id starts where the previous
# character is whitespace or the string begins; (?<!\S) checks exactly that.
# A plain \b would also fire in the middle of an id (e.g. "john.kim@..."),
# and the local part may contain . + - so ids like "k.smith@..." are found.
pattern5 = re.compile(r'(?<!\S)k[a-zA-Z0-9_.+-]*@[a-zA-Z0-9_]*\.[a-z]{2,4}\b')
matches = pattern5.finditer(' '.join(x))
# Each match is one id starting with "k".
counter = sum(1 for _ in matches)

print('Number of such email Ids:', counter)