In [1]:
# first we will import re module
import re

In [3]:
# match() checks for A MATCH AT THE BEGINNING and returns a boolean
# search() checks for a MATCH ANYWHERE and returns a boolean

text = "This is a good day"

if re.search("good", text):  # first parameter is the pattern
    print("Wonderful")
else:
    print("Alas!")
       

Wonderful


In [4]:
# Tokenizing: Splitting the string into substring based on patterns [It is a core part of NLP]
# findall() and split() parses thru strings and returns chunks
text = "Amy works diligently. Amy gets good grades. Our student Amy is successful"
# let's split this on all instances of Amy
re.split("Amy", text)

['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is successful']

In [5]:
# If we want to count how many times Amy appears in the string, we use findall()
re.findall("Amy", text)

['Amy', 'Amy', 'Amy']

In [6]:
# Anchors: Anchors specify start and/or end of the string that you are trying to match. ^ means start and $ means end.

text = "Amy works diligently. Amy gets good grades. Our student Amy is successful"

# Lets see if this begins with Amy
re.search("^Amy", text)

<re.Match object; span=(0, 3), match='Amy'>

In [7]:
# this returns a new object, called re.Match object. An re.Match object always has a boolean value of True. So you can always
# evaluate it in an if statement as we did earlier.

# Patterns and Character classes

In [8]:
grades = "AAAAAAAAAAAAAACCCCCBBBAAAA"
re.findall("B", grades)

['B', 'B', 'B']

In [9]:
# If we want to count the number of A's or B's, we use [AB] as pattern, not AB, because AB means A's followed by B only
re.findall("[AB]", grades)

['A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'B',
 'B',
 'B',
 'A',
 'A',
 'A',
 'A']

In [13]:
# This is called set operator, you can also use a range of characters. Lowercase pattern: [a-z], Uppercase pattern: [A-Z]
re.findall("[A][BC]",grades)

['AC']

In [14]:
# | signifies or
re. findall("AB|AC", grades)

['AC']

In [15]:
# ^ can be used as NOT operator
re.findall("[^A]", grades)

['C', 'C', 'C', 'C', 'C', 'B', 'B', 'B']

In [16]:
# Inside a set operator , i.e., [], caret means not
re.findall("^[^A]", grades)  # find all strings that start with not A

[]

In [17]:
# This is an empty list because in our string, it only starts with A

# Quantifiers

In [18]:
# Quantifiers are the number of times you want a pattern to be matched for it to count as a match. 
# The most basic quantifier is e{m,n}, where e is what we are matching, m is the minimum number of matches, n is the maximum

In [20]:
re.findall("A{2,100}", grades)  # finds the times when the student has between 2 to 100 back-to-back A's

['AAAAAAAAAAAAAA', 'AAAA']

In [21]:
re.findall("A{2,2}", grades)

['AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA']

In [22]:
# if you do e{m, n} instead of e{m,n}, then it will return an empty list
# if you dont put a quantifier, the default is {1,1}
re.findall("AA", grades)

['AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA']

In [23]:
# one argument as quantifier means it is both m and n
re.findall("A{2}", grades)

['AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA']

In [24]:
# Using this we can find a decreasing trend in student's grades

In [26]:
re.findall("A{1,100}B{1,100}C{1,100}",grades)

[]

In [27]:
# shorthand quantifiers:
# an asterisk * to match 0 or more times
# question mark ? to match one or more times
# plus sign + to match one or more times
# let's look at a more complex example, and load some data scraped from wikipedia
with open("ferpa.txt","r") as file:
    # we'll read this in a variable called wiki
    wiki = file.read()
# let's print that variable out to the screen
wiki

'Overview[edit]\nFERPA gives parents access to their child\'s education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student\'s consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.\n\nOther regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student\'s personally identifiable information without the student\'s consent.[2]\n\nExamples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student\'s grades o

In [28]:
# Note that all headers say [edit] after it followed by newline character.
# So for a list of all the headers, we can do so using findall()
re.findall("[a-zA-Z]{1,1000}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [29]:
# we can use \w match any letter, including digits and number
re.findall("[\w]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [31]:
# \s matches all whitespace
# * means 0 or more
# you can also improve by adding a space after \w, as [\w ], this means it will look for all letter digits AND  spaces
re.findall("[\w ]*\[edit\]",wiki)

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [34]:
for title in re.findall("[\w ]*\[edit\]",wiki):
    # Now we will take the intermediate result, split on the square bracket [ just taking the first result
    print(re.split("[\[]",title)[0])

Overview
Access to public records
Student medical records


# Groups

In [37]:
re.findall("([\w ]*)(\[edit\])",wiki)  
# ([\w ]*) means any words or letters followed by space any number of times
# (\[edit\]) means the characters [edit]. \ in the beginning of (\[edit\]) means it will match for [edit]. The last \ inside []
# denotes it is not a set

[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [38]:
# findall() returns string, search() and match() returns match objects. If we want match objects as outputs, we use .finditer()
for item in re.finditer("([\w ]*)(\[edit\])",wiki):
    print(item.groups())
    # groups for specific match item

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [40]:
# groups() returns a tuple of the group, we can get individual group using group(number), where group(0) is thw whole match
# and each other number is the portion of the match we are interested in
for item in re.finditer("([\w ]*)(\[edit\])",wiki):
    print(item.group(1))

Overview
Access to public records
Student medical records


In [45]:
# for labeling groups, we use (?P<name>), where () means group, ?P means extension to basic regexes, and <name>
# is the dictionary key we want to use wrapped in <>.
for items in re.finditer("(?P<title>[\w ]*)(?P<edit_link>\[edit\])",wiki):
    print(item.groupdict()["title"])


KeyError: 'title'

# Look-ahead and Look-behind

In [46]:
for item in re.finditer("(?P<title>[\w ]+)(?=\[edit\])",wiki):
    print(item)

<re.Match object; span=(0, 8), match='Overview'>
<re.Match object; span=(2715, 2739), match='Access to public records'>
<re.Match object; span=(3692, 3715), match='Student medical records'>


In [49]:
with open("buddhist.txt") as file:
    wiki = file.read()

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1589: character maps to <undefined>