# Regular Expressions

In [None]:
#Regular Expressions
import re

# I/O
# Manual open/close: the try/finally guarantees the handle is released even if read() raises.
var = open('file location', encoding='utf-8')

# Read, Closing
try:
    var.read()
finally:
    var.close()

# Preferred form: the context manager closes the file automatically when the block exits.
with open("some_file.txt") as open_file:
    data = open_file.read()

# Match, Search, Findall

In [3]:
import re

# Raw strings (r'...') keep backslashes as written, which is why regex patterns are usually spelled this way.
pattern = r'tesseradecades' #raw string
pattern2 = r'phyllophyllin'
# Text that the match/search/findall cells search through.
sequence = 'tesseradecades-septuplet-phyllophyllinnn-phyllophyllin'

In [4]:
#Matching the beginning of a string
# re.match() returns a Match object, or None if the pattern does not match at the start of the string;
# .group() returns the matched text.
re.match(pattern, sequence).group()

'tesseradecades'

In [5]:
#Searching anywhere in the text...returns the first occurrence
re.search(pattern2, sequence).group()

'phyllophyllin'

In [6]:
#findall() finds all the non-overlapping matches in the entire sequence and returns them as a list of strings. Each returned string represents one match.
re.findall(pattern2, sequence)

['phyllophyllin', 'phyllophyllin']

# Flags

In [7]:
# re.I (re.IGNORECASE)     ignore case
# re.X (re.VERBOSE)        lets a regular expression span multiple lines and contain (ignored) whitespace and comments
# re.I | re.X              use the 'pipe' symbol to combine more than one flag
# re.M (re.MULTILINE)      makes '^' and '$' match at the beginning and end of every line in your text, not only of the whole string

# Add verbose (for a pattern written across multiple lines), and multiline (for multiple lines in the searched string)

# Special Characters

In [8]:
# '.' matches any single character except newline character.
# '\w'  Matches any single letter, digit or underscore.
# '\W'  Matches any single character that '\w' does not match (anything but a letter, digit or underscore).
# '\d' digits 0-9
# '[abc]' matches 'a' or 'b' or 'c'...matches one character
# '[a-zA-Z0-9]' all caps, all lowercase, all digits
# '\s' space, tab, newline, return
# '\S' any single character that '\s' does not match (non-whitespace)

# '\t' tab
# '\n' newline
# '\r' return 

# '^' matches the characters after the '^' at the beginning of the string
# '$' matches the characters before the '$' at the end of the string
# '\A' matches only at the start of the whole string, even in re.MULTILINE mode (unlike '^', which then also matches at the start of each line).
# '\b' matches only at a word boundary (the beginning or end of a word)
# '\B' matches anywhere that is NOT a word boundary

# The variable holds the text being searched, not a regex, so it gets its own name.
# Storing it in `pattern` would overwrite the regex defined for the match/search
# examples and break those cells when they are re-run afterwards.
sentence = r'Look! There is a tesseradecade eggs!'
print(re.search(r'^Look', sentence).group())    # '^'  anchors at the start of the string
print(re.search(r'eggs!$', sentence).group())   # '$'  anchors at the end of the string
print(re.search(r'\ALook', sentence).group())   # '\A' anchors at the start of the string
print(re.search(r'\bLo', sentence).group())     # '\b' boundary at the start of a word
print(re.search(r'ok\b', sentence).group())     # '\b' boundary at the end of a word

Look
eggs!
Look
Lo
ok


# Repetitions

In [9]:
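# '*' {0,}   preceding item is OPTIONAL (zero or more repetitions) and greedy
# '+' {1,}   preceding item is REQUIRED (one or more repetitions) and greedy
# '?' {0,1}  preceding item is optional (zero or one) and also greedy; a '?' placed AFTER another quantifier ('*?', '+?', '??') makes that quantifier NOT greedy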

# {x} Repeat exactly x number of times
# {x,} Repeat at least x times or more
# {x,y} Repeat at least x times but no more than y times; {,y} means from 0 up to y times (write the braces without spaces)

# Sets
# [aple] #Matches ONE character that is a, p, l or e; repeat the set (e.g. [aple]+) to match a whole word such as 'apple'
# [a-z]
# [A-Z]
# [a-zA-Z]
# [0-9]
# [^234] #matches any single character EXCEPT '2', '3' or '4'
# [\w+] #this will include '+' symbol

# Grouping

In [10]:
# ([abc]) creates a group that contains a set and accessed from the Match object as .group(1)
# (?P<name>[abc]) creates a named group and is accessed from the Match object as .group('name')...only returns one result
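# Works with search() and match(), which return a single Match object (first result only). findall() returns plain strings (or tuples of groups) rather than Match objects, so .group() is not available on its results.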
# .groups() method to show all of the groups on a Match object. 

# Named groups: search() stops at the first address and exposes its parts by name.
email_address = 'Please contact us at: support@datacamp.com, xyz@datacamp.com'
match1 = re.search(r'(?P<address>[\w.-]+)@(?P<Domain>[\w.-]+)', email_address)

for group_name in ('address', 'Domain'):
    print(match1.group(group_name))
print(' ')

#For all results: findall() returns every complete address in the same text.
match2 = re.findall(r'[\w.-]+@[\w.-]+', email_address)

for i in match2:
    print(f'Using findall():{i}')

support
datacamp.com
 
Using findall():support@datacamp.com
Using findall():xyz@datacamp.com


In [11]:
#Non-greedy matches
heading = r'<h1>TITLE</h1>'

#Greedy: '.*' takes as much as it can, so the match runs to the last '>'
greedy = re.match(r'<.*>', heading)
print(greedy.group())

#Nongreedy: '.*?' takes as little as it can, so the match ends at the first '>'
lazy = re.match(r'<.*?>', heading)
print(lazy.group())

<h1>TITLE</h1>
<h1>


In [12]:
#Findall
#re.findall(pattern, string, flags=0) -> list with every non-overlapping match
email_address = "Please contact us at: support@datacamp.com, xyz@datacamp.com"
addresses = re.findall(r'[\w\.-]+@[\w\.-]+', email_address)
for address in addresses:
    print(address)

#re.sub(pattern, repl, string, count=0, flags=0) -> copy of string with each match replaced by repl
email_address = "Please contact us at: xyz@datacamp.com"
new_email_address = re.sub(
    r'([\w\.-]+)@([\w\.-]+)',
    r'support@datacamp.com',
    email_address,
)
print(new_email_address)

#re.compile(pattern, flags=0) -> reusable pattern object with its own search()/match() methods
sequence = "Cake and cookie"
pattern = re.compile(r"cookie")
pattern.search(sequence).group()

#This is equivalent to:
re.search(pattern, sequence).group()

#Compiles a regular expression pattern into a regular expression object. 
#When you need to use an expression several times in a single program, 
#using the compile() function to save the resulting regular expression object for reuse is more efficient. 
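#Note that the compiled versions of the most recent patterns passed to compile() and the module-level matching functions are cached,
#so programs that use only a few regular expressions at a time do not need to worry about compiling them.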

support@datacamp.com
xyz@datacamp.com
Please contact us at: support@datacamp.com


'cookie'

In [16]:
# Compiling and Loops
# .groupdict() - method to generate a dictionary from a Match object's groups. The keys will be the group names. The values will be the results of the patterns in the group.
# re.finditer() - function that returns an iterator of Match objects for the non-overlapping matches of a regular expression. Very handy in for loops.
# .group() - method to access the content of a group. 0 or none is the entire match. 1 through how ever many groups you have will get that group. Or use a group's name to get it if you're using named groups.

In [14]:
import re

def find_words(count, string):
    """Return every run of at least `count` word characters found in `string`.

    Args:
        count: Minimum run length, as a non-negative integer (or its decimal
            string). It is spliced into the quantifier of the pattern.
        string: Text to search.

    Returns:
        A list of the matching substrings, in order of appearance.

    Raises:
        ValueError: If `count` is not written with plain digits (for example
            a negative number or a float).
    """
    min_length = str(count)
    # Anything other than plain digits makes the regex parser treat '{...,}'
    # as literal text instead of a quantifier, which silently matches the
    # wrong thing, so reject it up front.
    if re.fullmatch(r'[0-9]+', min_length) is None:
        raise ValueError('count must be a non-negative integer, got ' + repr(count))
    return re.findall(r'\w{' + min_length + ',}', string)


find_words(1, "dog, cat, baby, balloon, me")

['dog', 'cat', 'baby', 'balloon', 'me']

In [15]:
import re

# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
with open('/Users/pmcorrea/Notebooks/Resources/The Crazy Ones.txt') as file:
    # The context manager closes the handle even if read() raises.
    text = file.read()

# 'the ' followed by a fixed-width run of arbitrary (non-newline) characters;
# re.I makes the match case-insensitive, so 'The ...' is found as well.
pattern = r'the .......................'
results = re.findall(pattern, text, re.I)
results

['the crazy ones. The misfits',
 'The rebels. The troublemake',
 'The round pegs in the squar',
 'The ones who see things dif',
 'the status quo. You can quo',
 "the only thing you can't do",
 'the human race forward. And',
 'the crazy ones, we see geni',
 'the people who are crazy en',
 'the world, are the ones who']