**Contents:**
* [Finding Patterns in Text](#0)
* [Preprocessing using Regex](#1)
* [Intro to re Module](#2)
* [Validating Phone Numbers with Python](#3)
* [Parsing URLs](#4)
* [Parsing Binary](#5)
* [Symbolic Group Names](#6)
* [Date Parsing](#7)
* [REGEX Compilation Flags](#8)
* [REGEX Substitution Basics](#9)
* [REGEX Profanity Filter](#10)
* [Swapping File Names](#11)
* [](#12)

<a id='0'></a>
## Finding Patterns in Text

https://pythex.org/

In [134]:
import re

In [135]:
sentence = "I was born in the year 1996"
re.match(r".*", sentence) #.=any character; *=0 or more

<_sre.SRE_Match object; span=(0, 27), match='I was born in the year 1996'>

In [136]:
sentence =""
re.match(r".+", sentence) #.=any character; +=1 or more; return nothing

In [137]:
sentence = "I was born in the year 1996"
re.match(r"[a-zA-Z]+", sentence) #looks at the first match in the sentence

<_sre.SRE_Match object; span=(0, 1), match='I'>

In [138]:
sentence = "a"
re.match(r"ab?",sentence) #match a, then 0 or exactly 1 "b"

<_sre.SRE_Match object; span=(0, 1), match='a'>

In [139]:
sentence = "ab"
re.match(r"ab?",sentence) #match a, then 0 or exactly 1 "b"

<_sre.SRE_Match object; span=(0, 2), match='ab'>

In [140]:
sentence = "abb"
re.match(r"ab?",sentence) #match a, then 0 or exactly 1 "b"

<_sre.SRE_Match object; span=(0, 2), match='ab'>

In [141]:
sentence = "b"
re.match(r"ab?",sentence) #match a, then 0 or exactly 1 "b", no response cause looking for a or ab, not b

In [142]:
sentence = "1996 was the year when I was born"
re.match(r"[a-zA-Z]+", sentence) #local search, no response because sentence starts with 1996, not text

In [143]:
sentence = "1996 was the year when I was born"
re.search(r"[a-zA-Z]+", sentence) #global search

<_sre.SRE_Match object; span=(5, 8), match='was'>

In [144]:
#Starts with (^)
sentence = "1996 was the year when I was born"

if re.match(r"^1996", sentence):
	print("Match")
else:
	print("No match")

Match


In [145]:
sentence = "1996 was the year when I was born"
if re.match(r"^The", sentence):
	print("Match")
else:
	print("No match")

No match


In [146]:
sentence = "1996 was the year when I was born"
#Ends with ($) search, not match
if re.search(r"born$", sentence):
	print("Match")
else:
	print("No match")

Match


In [147]:
sentence = "I love Avengers"
print(re.sub(r"Avengers", "Justice League", sentence))

I love Justice League


In [148]:
sentence = "I love Avengers Avengers"
print(re.sub(r"Avengers", "Justice League", sentence))

I love Justice League Justice League


In [149]:
sentence = "I love Avengers Avengers"
print(re.sub(r"[a-z]", "0", sentence))
print(re.sub(r"[a-zA-Z]", "0", sentence))	
print(re.sub(r"[a-zA-Z]", "0", sentence, flags=re.I)) #case insensitive
print(re.sub(r"[a-zA-Z]", "0", sentence, 1, flags=re.I)) #only one occurence
print(re.sub(r"[a-zA-Z]", "0", sentence, 5, flags=re.I)) #only one occurence

I 0000 A0000000 A0000000
0 0000 00000000 00000000
0 0000 00000000 00000000
0 love Avengers Avengers
0 0000 Avengers Avengers


In [150]:
sentence1 = "Welcome to the year 2018"
sentence2 = "Just ~% +++--- arrived at @Jack's place. #fun"
sentence3 = "I                   Love                you"

sentence1_modified = re.sub(r"\d","",sentence1) #replaces digits with empty string
sentence1_modified

'Welcome to the year '

In [151]:
sentence2 = "Just ~% +++--- arrived at @Jack's place. #fun"
sentence2_modified = re.sub(r"[%@#~+\-\.']","",sentence2) #replaces @#~+-. with empty string
sentence2_modified = re.sub(r"\s+"," ",sentence2_modified)
sentence2_modified

'Just arrived at Jacks place fun'

In [152]:
sentence2 = "Just ~% +++--- arrived at @Jack's place. #fun"
sentence2_modified = re.sub(r"[\w]"," ",sentence2) #replaces a-zA-Z0-9 with empty string
sentence2_modified

"     ~% +++---            @    '       . #   "

In [153]:
sentence2 = "Just ~% +++--- arrived at @Jack's place. #fun"
sentence2_modified = re.sub(r"[\W]"," ",sentence2) #replaces "not a word char" with empty string
sentence2_modified

'Just           arrived at  Jack s place   fun'

In [154]:
sentence2 = "Just ~% +++--- arrived at @Jack's place. #fun"
sentence2_modified = re.sub(r"\s+"," ",sentence2_modified) #replaces one or more space with a single space
sentence2_modified

'Just arrived at Jack s place fun'

In [155]:
sentence2 = "Just ~% +++--- arrived at @Jack's place. #fun"
sentence2_modified = re.sub(r"\s+[a-zA-Z]\s+"," ",sentence2_modified) #replaces _s_ with single space
sentence2_modified

'Just arrived at Jack place fun'

In [156]:
sentence3 = "I                   Love                you"
sentence3_modified = re.sub(r"\s+"," ", sentence3) #replace spaces with one space
sentence3_modified

'I Love you'

In [157]:
sentence3 = "I                   Love                you"
sentence3_modified = re.sub(r"\s+Love\s+"," hate ", sentence3) #replace Love with hate
sentence3_modified

'I hate you'

<a id='1'></a>
## Preprocessing using Regex

In [158]:
X = ["This is a wolf #scary",
	"Welcome to the jungle #missing",
	"11322 the number to know",
	"Remember the name s - John",
	"I           Love           you"]
	
for i in range(0, len(X)):
	X[i] = re.sub(r"\W"," ",X[i])  #remove all non-word characters
	X[i] = re.sub(r"\d"," ",X[i]) #remove all digits
	X[i] = re.sub(r"\s+[a-z]\s+"," ", X[i], flags=re.I) #remove all single characters
	X[i] = re.sub(r"\s+"," ", X[i])
	X[i] = re.sub(r"^\s","", X[i])
	X[i] = re.sub(r"\s$","", X[i])
X

['This is wolf scary',
 'Welcome to the jungle missing',
 'the number to know',
 'Remember the name John',
 'I Love you']

<a id='2'></a>
## Intro to re Module

In [159]:
#define our phone number regex
phone_pattern = re.compile(r'\d{3} \d{3}-\d{4}')

In [160]:
#search a string with our regex
res = phone_pattern.search('Call me at 415 555-4242!') #first match
res.group()

'415 555-4242'

In [161]:
res = phone_pattern.findall('Call me at 415 555-4242! or 310 234-9999') #all matches
res

['415 555-4242', '310 234-9999']

<a id='3'></a>
## Validating Phone Numbers with Python

In [162]:
def extract_phone(input):
    phone_regex = re.compile(r'\b\d{3} \d{3}-\d{4}\b')
    match = phone_regex.search(input)
    if match:
        rerturn match.groupo()
    return None

print(extract_phone("my number is 432 567 8976"))
print(extract_phone("my number is 432 567 897622"))
print(extract_phone("my number is 432 567 897622 asdjhasd"))
print(extract_phone("432 567 897622"))

SyntaxError: invalid syntax (<ipython-input-162-a24e6b14e8ec>, line 5)

In [None]:
def extract_all_phone(input):
	phone_regex = re.compile(r'\b\d{3} \d{3}-\d{4}\b')
	return phone_regex.findall(input)

print(extract_all_phone("my number is 432 567 8976 or call me at 345 666-7899"))
print(extract_all_phone("my number is 432 567 89")

In [None]:
# def is_valid_phone(input):
#     phone_regex = re.compile(r'^\d{3} \d{3}-\d{4}$')
#     match = phone_regex.search(input)
#     if match:
#         return True
#     return False

def is_valid_phone(input):
    phone_regex = re.compile(r'\d{3} \d{3}-\d{4}')
    match = phone_regex.fullmatch(input)
    if match:
        return True
    return False

print(is_valid_phone("432 567-8976"))
print(is_valid_phone("432 567-8976 ads"))
print(is_valid_phone("432 567-8976 d"))

In [None]:
# Don't forget to import re!
#The time must start with a digit, and there can be a second optional digit before the colon.  
#Then there's the colon and two mandatory digits.  I used ^ and $ to make sure the time was the only thing in the input string.

import re
# Define is_valid_time below:
# def is_valid_time(input):
#     time_regex = re.compile(r'^(\d{1}|\d{2}):\d{2}$')
#     match = time_regex.search(input)
#     if match:
#         return True
#     return False


def is_valid_time(input):
    pattern = re.compile(r'^\d\d?:\d\d$')
    match = pattern.search(input)
    if match:
        return True
    return False

print(is_valid_time("10:45"))
print(is_valid_time("1:23"))
print(is_valid_time("18.45"))
print(is_valid_time("145:23"))
print(is_valid_time("it is 12:15"))

<a id='4'></a>
## Parsing URLs

In [None]:
url_regex = re.compile(r'(https?)://(www\.[A-Za-z-]{2,256}\.[a-z]{2,6})([-a-zA-Z0-9@;%_\+.~#&//=]*)')
match = url_regex.search("http://www.youtube.com/videos/asd/das/asd")
print(match.group())
print(match.group(0))
print(f"Protocal: {match.group(1)}")
print(f"Domain: {match.group(2)}")
print(f"Everyting Else: {match.group(3)}")
print(match.groups())

In [None]:
match = url_regex.search("http://www.my-website.com/bio?data=blah&cat=yes")
print(match.group())
print(match.group(0))
print(f"Protocal: {match.group(1)}")
print(f"Domain: {match.group(2)}")
print(f"Everyting Else: {match.group(3)}")
print(match.groups())

<a id='5'></a>
## Parsing Binary

In [None]:
#My regex looks like this: '\b[10]{8}\b'   It consists of eight 1s or 0s, surrounded by word boundaries on either side.  
#Remember a word boundary is either a space or the start/end of a line.

I then used findall  rather than search, to return a list of all matches.

# define parse_bytes below:
def parse_bytes(input):
    binary_regex = re.compile(r'\b\[10]{8}\b')
    results = binary_regex.findall(input)
    return results

<a id='6'></a>
## Symbolic Group Names

In [None]:
def parse_name(input):
	name_regex = re.compile(r'^(Mr\.|Mrs\.|Ms\.|Mdme\.) ([A-Za-z]+) ([A-Za-z]+)$')
	matches = name_regex.search(input)
	print(matches.groups())

parse_name("Mrs. Tilda Swinton")

In [None]:
# def parse_first_name(input):
# 	name_regex = re.compile(r'^(Mr\.|Mrs\.|Ms\.|Mdme\.) ([A-Za-z]+) ([A-Za-z]+)$')
# 	matches = name_regex.search(input)
# 	print(matches.group(2))

def parse_name_pieces(input):
	name_regex = re.compile(r'^(Mr\.|Mrs\.|Ms\.|Mdme\.) (?P<first>[A-Za-z]+) (?P<last>[A-Za-z]+)$')
	matches = name_regex.search(input)
	print(matches.group())
	print(matches.group('first'))
	print(matches.group('last'))
    
parse_name_pieces("Mrs. Tilda Swinton")

<a id='7'></a>
## Date Parsing

In [None]:
#define parse_date below
def parse_date(input):
    date_regex = re.compile(r'^([0-9]{2})[.,/]([0-9]{2})[.,/]([0-9]{4})$')
    match = date_regex.search(input)
    print(f"m: {match.group(1)}")
    print(f"d: {match.group(2)}")
    print(f"y: {match.group(3)}")

parse_date("12,04,2003")

In [None]:
# define parse_date below
def parse_date(input):
    date_regex = re.compile(r'^(\d\d)[\.,/](\d\d)[\.,/](\d\d\d\d)$')
    match = date_regex.search(input)
    if match:
        return {
            "d": match.group(1),
            "m": match.group(2),
            "y": match.group(3),
        }
    return None

parse_date("12,04,2003")

<a id='8'></a>
## REGEX Compilation Flags

In [None]:
pat = re.compile(r'^([a-z0-9_\.-]+)@([0-9a-z\,-]+)\.([a-z\.]{2,6})$')

In [None]:
pattern = re.compile(r"""
	^([a-z0-9_\.-]+) #username
	@				#single at sign
	([0-9a-z\,-]+)	#e-mail provider
	\.				#single period
	([a-z\.]{2,6})$	#com, org, net, etc.
""", re.X)

match = pattern.search("thomas123@yahoo.com")
print(match.group())
print(match.groups())

In [None]:
#re.VERBOSE | re.IGNORECASE
pattern = re.compile(r"""
    ^([a-z0-9_\.-]+) #username
    @				#single at sign
    ([0-9a-z\,-]+)	#e-mail provider
    \.				#single period
    ([a-z\.]{2,6})$	#com, org, net, etc.
""", re.VERBOSE | re.IGNORECASE)

match = pattern.search("thomas123@yahoo.com")
print(match.group())
print(match.groups())

In [None]:
#re.X | re.I
pattern = re.compile(r"""
	^([a-z0-9_\.-]+) #username
	@				#single at sign
	([0-9a-z\,-]+)	#e-mail provider
	\.				#single period
	([a-z\.]{2,6})$	#com, org, net, etc.
""", re.X | re.I)

match = pattern.search("thomas123@yahoo.com")
print(match.group())
print(match.groups())

<a id='9'></a>
## REGEX Substitution Basics

In [None]:
text = "Last night Mrs. Daisy and Mr. White murdered Ms. Chow"

pattern = re.compile(r'(Mr\.|Mrs\.|Ms\.) ([a-z])[a-z]+', re.I)
print(pattern.findall(text))
print(pattern.search(text).group())

In [None]:
pattern = re.compile(r'(Mr\.|Mrs\.|Ms\.) ([a-z])[a-z]+', re.I)
result = pattern.sub("REDACTED", text)
print(result)

In [None]:
pattern = re.compile(r'(Mr\.|Mrs\.|Ms\.) ([a-z])[a-z]+', re.I)
result = pattern.sub("\g<1> \g<2>", text)
print(result)

<a id='10'></a>
## REGEX Profanity Filter

In [None]:
def censor(text):
    pattern = re.compile(r'[a-z]{5,8}', re.I)
    result = pattern.sub("CENSORED", text)
    return result

In [None]:
censor("Frack you")

In [None]:
censor("I hope you fracking die")

In [None]:
#It looks for a word boundary and then the letters "frack" and then optionally more word characters afterwards, and 
#then a word boundary again.  I used the word boundaries to prevent false positives if the "frack" occurs in the middle of 
#another word.s

def censor(input):
    pattern = re.compile(r'\bfrack\w*\b', re.IGNORECASE)
    return pattern.sub("CENSORED", input)

In [None]:
censor("Frack you")

In [None]:
censor("I hope you fracking die")

<a id='11'></a>
## Swapping File Names

In [None]:
titles = [
	"Significant Others (1987)",
	"Tales of the City (1978)",
	"The Days of Anna Madrigal (2014)",
	"Mary Ann in Autumn (2010)",
	"Further Tales of the City (1982)",
	"Babycakes (1984)",
	"More Tales of the City (1980)",
	"Sure of You (1989)",
	"Michael Tolliver Lives (2007)"
]
titles.sort()
print(titles)

In [None]:
pattern = re.compile(r'(?P<title>^[\w ]+) \((?P<date>\d{4})\)')
result = pattern.sub("\g<date> - \g<title>", titles[0])
print(result)

In [None]:
titles.sort()
fixed_titles = []

for book in titles:
	result = pattern.sub("\g<2> - \g<1>", book)
	fixed_titles.append(result)
fixed_titles.sort()
print(fixed_titles)