Click <a href='https://www.dataquest.io/blog/web-scraping-tutorial-python/'>here</a> to learn about Regular Expressions (RegEx) using Python.

In [None]:
########################
# DO NOT RUN THIS CELL #
########################

a, X, 9, < -- ordinary characters just match themselves exactly.
. (a period) -- matches any single character except newline '\n'
\w -- matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_].
\W -- matches any non-word character.
\b -- matches word boundary (in between a word character and a non word character)
\s -- matches a single whitespace character -- space, newline, return, tab
\S -- matches any non-whitespace character.
\t, \n, \r -- tab, newline, return
\d -- matches any numeric digit [0-9]
\D -- matches any non-digit character.
^ -- matches the beginning of the string; as the first character inside [], it negates the set (e.g. [^abc] matches any character except a, b, or c)
$ -- matches the end of the string
\ -- escapes special character.
(x|y|z) matches exactly one of x, y or z.
(x) in general is a remembered group. We can get the value of what matched by using the groups() method of the object returned by re.search.
x? matches an optional x character (in other words, it matches an x zero or one times).
x* matches x zero or more times.
x+ matches x one or more times.
x{m,n} matches an x character at least m times, but not more than n times.
?: matches an expression but does not capture it. Non-capturing group, written (?:...).
?= matches a suffix but exclude it from capture. Positive lookahead.
a(?=b) will match the "a" in "ab", but not the "a" in "ac"
In other words, a(?=b) matches the "a" which is followed by the string 'b', without consuming what follows the a.
?! matches if suffix is absent. Negative look ahead.
a(?!b) will match the "a" in "ac", but not the "a" in "ab"
?<= positive look behind
[] matches any single character from the set listed inside the brackets, e.g. [abc] or the range [a-z]
?<! negative look behind

########################
# DO NOT RUN THIS CELL #
########################

What are word boundaries?
--------------------------------------------------
Before the first character in the string, if the first character is a word character.<br>
After the last character in the string, if the last character is a word character.<br>
Between two characters in the string, where one is a word character and the other is not a word character<br>

In [None]:
import re

# Read the whole contact list into memory. The context manager closes the
# handle even if read() raises, so the file is never left open.
with open("names.txt", encoding='utf-8') as file:
    data = file.read()

In [None]:
# .match() - Checks for specific strings starting from the beginning of a string
re.match(r'Hawkins', data)

In [None]:
re.match(r'Patel', data)

In [None]:
re.match(r"Milliken", data)

In [None]:
# .search() - Looks for FIRST matching string anywhere in the searchable text string
re.search(r'Patel', data)

In [None]:
re.search(r'Butz', data)

In [None]:
re.search(r'\w, \w', data)

In [None]:
re.search(r'\w\w\w\w\w\w\w, \w\w\w\w\w', data)

In [None]:
re.search(r'\(\d\d\d\) \d\d\d-\d\d\d\d', data)

In [None]:
re.search(r'\d'*4, data)

<strong>Exercise 1</strong>:<br>
Write a function that checks for n number of consecutive digits

In [None]:
def find_digits(n, searchable_text):
    """Return the first match of ``n`` back-to-back digits in ``searchable_text``.

    The result is an ``re.Match`` object, or ``None`` when no such run exists.
    """
    # Build the pattern by repeating the digit token n times.
    digit_run = "".join(r'\d' for _ in range(n))
    return re.compile(digit_run).search(searchable_text)

find_digits(19, data)

In [None]:
# find_digits(4, data) => <re.Match object; span=(XX, XX), match='5555'>

In [None]:
# .span() reports where the match was found. Match.pos is only the index the
# search started from (0 unless a start position is passed in), so it says
# nothing about the location of the phone number.
re.search(r'\(\d{3}\) \d{3}-\d{4}', data).span()

In [None]:
# List the attributes of a Match object. re.search is used because re.match
# only succeeds at the very start of the text; presumably `data` begins with a
# name rather than a phone number, which would make this dir(None) -- verify.
dir(re.search(r'\(\d{3}\) \d{3}-\d{4}', data))

In [None]:
# .findall() - Looks for matching string anywhere in the searchable text string and stores each instance into a list
re.findall(r'\(?\d{3}\)?\s?-?\d{3}-\d{4}', data)

In [None]:
# Separator after the area code: optionally one whitespace character or hyphen.
# Inside [...] a "?" is a literal question mark, not a quantifier, so the
# quantifier has to follow the set.
re.findall(r'\(?\d{3}\)?[-\s]?\d{3}-\d{4}', data)

In [None]:
re.findall(r'\w+, \w+', data)

In [None]:
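# Scratch example of a junk address (kept as a comment so Jupyter does not run it as a shell command): !@#$%^&*()@gmail.com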

In [None]:
re.findall(r'[-+.\w\d]+@[-.\w\d]+', data)

In [None]:
# The "." before "com" is escaped so it matches a literal dot rather than any character.
# NOTE(review): [codingtmple]+ matches any run of those letters (e.g. "google"),
# not only the word "codingtemple" -- confirm that is intended.
re.findall(r'[-+.\w\d]+@[codingtmple]+\.com', data)

In [None]:
# re.VERBOSE/re.X - Allows multiline regular expressions
# re.IGNORECASE/re.I - Ignores casing
# NOTE: [^vog\t] is a negated character *set*: it rejects the single characters
# v, o, g (either case, because of re.I) and tab. It does not reject the
# substring "gov"; excluding ".gov" domains would need a negative lookahead.
re.findall(r"""
    \b@[-.\w\d]* # word boundary, @ symbol, and zero or more domain characters
    [^vog\t]+    # one or more characters that are not v, o, g, or tab
    \b           # word boundary
""", data, re.X|re.I)

In [None]:
re.findall(r'''
    \b[-\w]+, # last name
    \s        # one whitespace character
    [\w\- ]+   # 1 or more hyphens/word characters, spaces
    [^\t\n]   # omit tabs and newline characters
''', data, re.X|re.I)

In [None]:
# Pull every contact record out of `data`; with five capture groups in the
# pattern, each result is a 5-tuple of strings.
info = re.findall(r'''
   ^([-\w]*,\s[-\w ]+)\t           # last name, first name
   ([-\w\d+.]+@[-\w\d+.]+)\t       # email
   (\(?\d{3}\)?-?\s?\d{3}-\d{4})\t # phone number
   ([\w\s\d]*,\s[\w\d\s,]*)\t      # occupation and company
   (@[\w\d]+)?$                    # Twitter handle
''', data, flags=re.VERBOSE | re.IGNORECASE | re.MULTILINE)

info

In [None]:
# [
#     (First and last name,
#      email, 
#      phone,
#      title,
#      Twitter handle)
# ]


In [None]:
# Compile the record pattern once, with named groups so each field can be
# looked up by name on the resulting Match objects.
info = re.compile(r'''
    ^(?P<name>[-\w]*,\s[-\w ]+)\t             # last name, first name
   (?P<email>[-\w\d+.]+@[-\w\d+.]+)\t         # email
   (?P<phone>\(?\d{3}\)?-?\s?\d{3}-\d{4})\t   # phone number
   (?P<job>[\w\s\d]*,\s[\w\d\s,]*)\t          # occupation and company
   (?P<twitter>@[\w\d]+)?$                    # Twitter handle
''', flags=re.VERBOSE | re.IGNORECASE | re.MULTILINE)

In [None]:
# Walk every record the compiled pattern finds and print its named fields;
# a group that did not participate in the match prints as None.
for record in info.finditer(data):
    fields = record.groupdict()
    print(f"Name: {fields['name']}\nEmail: {fields['email']}\nPhone: {fields['phone']}\nJob: {fields['job']}\nTwitter: {fields['twitter']}\n\n")

##### In-class exercise 1: 

Use a regular expression to find every number in the given string

In [None]:
# output should be ['10', '1', '2']
my_string = "This string has 10 numbers, but it is only 1 string. I hope you solve this 2dy."
#Hint: Check out the last cell
#Output should be: ['10','1','2']
def getNums(my_string):
    """Return every run of consecutive digits in ``my_string`` as a list of strings."""
    digit_runs = re.compile(r'[0-9]+')
    return digit_runs.findall(my_string)
nums = getNums(my_string) 
print(nums)

##### In-class Exercise 2:

Write a function using regular expressions to find the domain name in the given email addresses (and return None for the invalid email addresses)<br><b>HINT: Use '|' for either or</b>

In [None]:
my_emails = ["jordanw@codingtemple.orgcom", "pocohontas1776@gmail.com", "helloworld@aol..com", "yourfavoriteband@g6.org", "@codingtemple.com"]

### In-Class Exercise #3 <br>
<p>Print each person's name and Twitter handle, using groups. The output should look like:</p>
<p>==============<br>
   Full Name / Twitter<br>
   ==============</p>
<p>Derek Hawkins / @derekhawkins<br>
Norrbotten Governor / @sverik<br>
Ryan Butz / @ryanbutz</p>
<p>etc.</p>

In [5]:
# Exercise 3: print "First Last / @handle" for every record in `data`.
# Each record is assumed to be one tab-separated line that starts with
# "Last, First" and may end with a Twitter handle (the layout the `info`
# pattern above relies on) -- verify against names.txt.
# The fields between the name and the handle are skipped rather than captured;
# requiring the handle directly after the name's tab would never match a full record.
name12 = re.compile(r'''
    ^(?P<last>[-\w]*),\s(?P<first>[-\w ]+)\t   # last name, first name
    (?:[^\t\n]*\t)*                            # skip the tab-separated fields in between
    (?P<twitter>@\w+)?$                        # optional Twitter handle ending the line
''', re.X|re.I|re.M)

print("==============\nFull Name / Twitter\n==============")
for record in name12.finditer(data):
    # strip() tidies the result when the last name is empty (the pattern allows that).
    full_name = f"{record.group('first')} {record.group('last')}".strip()
    # The handle group is optional, so records without one print None.
    print(f"{full_name} / {record.group('twitter')}")